From b6d547cf92394cb3f8f73b23a769a3f4c241eec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 8 May 2024 13:22:27 +0200 Subject: [PATCH 001/130] Tiered compaction: add order asserts after delta key k-merge (#7648) Adds ordering asserts to the output of the delta key iterator `MergeDeltaKeys` that implements a k-merge. Part of #7296 : the asserts added by this PR get hit in the reproducers of #7296 as well, but they are earlier in the pipeline. --- pageserver/compaction/src/helpers.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 1b80373ba7..eb0e5ee82a 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -9,6 +9,7 @@ use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; +use std::fmt::Display; use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; @@ -214,7 +215,7 @@ pub struct KeySize { pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> where - K: Eq, + K: Eq + PartialOrd + Display + Copy, I: Stream>, D: CompactionDeltaEntry<'a, K>, { @@ -229,12 +230,15 @@ where num_values: 1, size: first.size(), }; + let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { accum.size += this.size(); accum.num_values += 1; } else { + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + last_key = accum.key; yield accum; accum = KeySize { key: this.key(), @@ -243,6 +247,7 @@ where }; } } + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); yield accum; } } From 870786bd8214480d8ce4aa56706cd8606b07ef15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 8 May 2024 13:22:55 +0200 Subject: [PATCH 002/130] Improve tiered compaction tests (#7643) Improves the tiered compaction tests: * Adds a new test that is a simpler version of the ignored `test_many_updates_for_single_key` test. * Reduces the amount of data that `test_many_updates_for_single_key` processes to make it execute more quickly. * Adds logging support. --- pageserver/compaction/tests/tests.rs | 45 ++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index 1cea2a20e1..7aa20e6863 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -1,5 +1,20 @@ +use once_cell::sync::OnceCell; use pageserver_compaction::interface::CompactionLayer; use pageserver_compaction::simulator::MockTimeline; +use utils::logging; + +static LOG_HANDLE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn setup_logging() { + LOG_HANDLE.get_or_init(|| { + logging::init( + logging::LogFormat::Test, + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + ) + .expect("Failed to init test logging") + }); +} /// Test the extreme case that there are so many updates for a single key that /// even if we produce an extremely narrow delta layer, spanning just that one @@ -11,13 +26,14 @@ use pageserver_compaction::simulator::MockTimeline; #[ignore] #[tokio::test] async fn test_many_updates_for_single_key() { + setup_logging(); let mut executor = MockTimeline::new(); - executor.target_file_size = 10_000_000; // 10 MB + executor.target_file_size = 1_000_000; // 1 MB - // Ingest 100 MB of updates to a single key. + // Ingest 10 MB of updates to a single key. for _ in 1..1000 { executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); - executor.ingest_uniform(10_000, 10, &(0..1)).unwrap(); + executor.ingest_uniform(1000, 10, &(0..1)).unwrap(); executor.compact().await.unwrap(); } @@ -33,3 +49,26 @@ async fn test_many_updates_for_single_key() { } } } + +#[tokio::test] +async fn test_simple_updates() { + setup_logging(); + let mut executor = MockTimeline::new(); + executor.target_file_size = 500_000; // 500 KB + + // Ingest some traffic. + for _ in 1..400 { + executor.ingest_uniform(100, 500, &(0..100_000)).unwrap(); + } + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + + println!("Running compaction..."); + executor.compact().await.unwrap(); + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } +} From a4a4d78993781e7aa723c1df6b833435c2fb2e8c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 8 May 2024 12:26:56 +0100 Subject: [PATCH 003/130] build(deps): bump moto from 4.1.2 to 5.0.6 (#7653) ## Problem The main point of this PR is to get rid of `python-jose` and `ecdsa` packages as transitive dependencies through `moto`. They have a bunch of open vulnerabilities[1][2][3] (which don't affect us directly), but it's nice not to have them at all. - [1] https://github.com/advisories/GHSA-wj6h-64fc-37mp - [2] https://github.com/advisories/GHSA-6c5p-j8vq-pqhj - [3] https://github.com/advisories/GHSA-cjwg-qfpm-7377 ## Summary of changes - Update `moto` from 4.1.2 to 5.0.6 - Update code to accommodate breaking changes in `moto_server` --- poetry.lock | 585 +++++++++++++++++++------ pyproject.toml | 2 +- test_runner/fixtures/remote_storage.py | 2 +- 3 files changed, 443 insertions(+), 146 deletions(-) diff --git a/poetry.lock b/poetry.lock index e437f5de74..ef9f572b17 100644 --- a/poetry.lock +++ b/poetry.lock @@ -158,6 +158,28 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.13.1" +description = "ANTLR 4.13.1 runtime for Python 3" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, + {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, +] + [[package]] name = "anyio" version = "4.3.0" @@ -267,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" [[package]] name = "aws-sam-translator" -version = "1.48.0" +version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false -python-versions = ">=3.7, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, - {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, - {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, + {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, + {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, ] [package.dependencies] boto3 = ">=1.19.5,<2.dev0" -jsonschema = ">=3.2,<4.0" +jsonschema = ">=3.2,<5" +pydantic = ">=1.8,<3" +typing-extensions = ">=4.4" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] +dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -798,24 +821,26 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.61.3" +version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false -python-versions = ">=3.6, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, - {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, + {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, + {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, ] [package.dependencies] -aws-sam-translator = ">=1.47.0" +aws-sam-translator = ">=1.87.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" -jsonschema = ">=3.0,<4.0" +jsonschema = ">=3.0,<5" junit-xml = ">=1.9,<2.0" -networkx = ">=2.4,<3.0" +networkx = ">=2.4,<4" pyyaml = ">5.4" +regex = ">=2021.7.1" sarif-om = ">=1.0.4,<1.1.0" +sympy = ">=1.0.0" [[package]] name = "charset-normalizer" @@ -931,24 +956,6 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "ecdsa" -version = "0.18.0" -description = "ECDSA cryptographic signature library (pure python)" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, - {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, -] - -[package.dependencies] -six = ">=1.9.0" - -[package.extras] -gmpy = ["gmpy"] -gmpy2 = ["gmpy2"] - [[package]] name = "exceptiongroup" version = "1.1.1" @@ -1268,6 +1275,23 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joserfc" +version = "0.9.0" +description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, + {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, +] + +[package.dependencies] +cryptography = "*" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jschema-to-python" version = "1.2.3" @@ -1309,6 +1333,20 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-ng" +version = "1.6.1" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." +optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpickle" version = "2.2.0" @@ -1338,24 +1376,39 @@ files = [ [[package]] name = "jsonschema" -version = "3.2.0" +version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, + {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, + {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, ] [package.dependencies] attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -setuptools = "*" -six = ">=1.11.0" +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-spec" +version = "0.1.6" +description = "JSONSchema Spec with object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, + {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, +] + +[package.dependencies] +jsonschema = ">=4.0.0,<4.18.0" +pathable = ">=0.4.1,<0.5.0" +PyYAML = ">=5.1" +requests = ">=2.31.0,<3.0.0" [[package]] name = "junit-xml" @@ -1371,6 +1424,52 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "lazy-object-proxy" +version = "1.10.0" +description = "A fast and thorough lazy object proxy." +optional = false +python-versions = ">=3.8" +files = [ + {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"}, + {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, +] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1422,64 +1521,80 @@ files = [ [[package]] name = "moto" -version = "4.1.2" +version = "5.0.6" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, - {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, ] [package.dependencies] +antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""} aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" -botocore = ">=1.12.201" +botocore = ">=1.14.0" cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" -docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} -ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} +docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" +joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" -python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.13.0" +responses = ">=0.15.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} -sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -apigatewayv2 = ["PyYAML (>=5.1)"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] +apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -eks = ["sshpubkeys (>=3.1.0)"] +awslambda = ["docker (>=3.0.0)"] +batch = ["docker (>=3.0.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cognitoidp = ["joserfc (>=0.9.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] +stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1654,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" -version = "0.2.3" +version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, - {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, + {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, + {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, ] [package.dependencies] -jsonschema = ">=3.0.0,<5.0.0" +jsonschema = ">=4.0.0,<4.18.0" +rfc3339-validator = "*" [package.extras] -isodate = ["isodate"] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] +docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"] [[package]] name = "openapi-spec-validator" -version = "0.4.0" -description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +version = "0.5.7" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, - {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, + {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, + {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, ] [package.dependencies] -jsonschema = ">=3.2.0,<5.0.0" -openapi-schema-validator = ">=0.2.0,<0.3.0" -PyYAML = ">=5.1" -setuptools = "*" - -[package.extras] -requests = ["requests"] +jsonschema = ">=4.0.0,<4.18.0" +jsonschema-spec = ">=0.1.1,<0.2.0" +lazy-object-proxy = ">=1.7.1,<2.0.0" +openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" @@ -1702,6 +1813,17 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] +[[package]] +name = "pathable" +version = "0.4.3" +description = "Object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, + {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, +] + [[package]] name = "pbr" version = "5.9.0" @@ -1728,6 +1850,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "prometheus-client" version = "0.14.1" @@ -1840,16 +1973,19 @@ files = [ ] [[package]] -name = "pyasn1" -version = "0.4.8" -description = "ASN.1 types and codecs" +name = "py-partiql-parser" +version = "0.5.4" +description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, ] +[package.extras] +dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] + [[package]] name = "pycparser" version = "2.21" @@ -1861,6 +1997,116 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.7.1" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, + {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, +] + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.18.2" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.18.2" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, + {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, + {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, + {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, + {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, + {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, + {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, + {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, + {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, + {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, + {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, + {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, + {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, + {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + [[package]] name = "pyjwt" version = "2.4.0" @@ -2115,28 +2361,6 @@ files = [ [package.dependencies] six = ">=1.5" -[[package]] -name = "python-jose" -version = "3.3.0" -description = "JOSE implementation in Python" -optional = false -python-versions = "*" -files = [ - {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, -] - -[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""} -ecdsa = "!=0.15" -pyasn1 = "*" -rsa = "*" - -[package.extras] -cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] -pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] - [[package]] name = "pywin32" version = "301" @@ -2181,7 +2405,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2216,6 +2439,94 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "regex" +version = "2024.4.28" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -2256,18 +2567,18 @@ urllib3 = ">=1.25.10" tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" optional = false -python-versions = ">=3.6,<4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ - {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, - {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, + {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, + {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] [package.dependencies] -pyasn1 = ">=0.1.3" +six = "*" [[package]] name = "ruff" @@ -2366,22 +2677,18 @@ files = [ ] [[package]] -name = "sshpubkeys" -version = "3.3.1" -description = "SSH public key parser" +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" optional = false -python-versions = ">=3" +python-versions = ">=3.8" files = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] [package.dependencies] -cryptography = ">=2.1.4" -ecdsa = ">=0.13" - -[package.extras] -dev = ["twine", "wheel", "yapf"] +mpmath = ">=0.19" [[package]] name = "toml" @@ -2652,16 +2959,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2899,4 +3196,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb" +content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e" diff --git a/pyproject.toml b/pyproject.toml index 4ec8efc2ff..ac7f9b061c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {extras = ["server"], version = "^4.1.2"} +moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 925e1b450f..132d2450a7 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -50,7 +50,7 @@ class MockS3Server: # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. - self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() From 8728d5a5fd261a483560fcc53ece7cc51ec82600 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 8 May 2024 16:32:21 +0200 Subject: [PATCH 004/130] neon_local: use `pageserver.toml` as source of truth for `struct PageServerConf` (#7642) Before this PR, `neon_local` would store a copy of a subset of the initial `pageserver.toml` in its `.neon/config`, e.g, `listen_pg_addr`. That copy is represented as `struct PageServerConf`. This copy was used to inform e.g., `neon_local endpoint` and other commands that depend on Pageserver about which port to connect to. The problem with that scheme is that the duplicated information in `.neon/config` can get stale if `pageserver.toml` is changed. This PR fixes that by eliminating populating `struct PageServerConf` from the `pageserver.toml`s. The `[[pageservers]]` TOML table in the `.neon/config` is obsolete. As of this PR, `neon_local` will fail to start and print an error informing about this change. Code-level changes: - Remove the `--pg-version` flag, it was only used for some checks during `neon_local init` - Remove the warn-but-continue behavior for when auth key creation fails but auth keys are not required. It's just complexity that is unjustified for a tool like `neon_local`. - Introduce a type-system-level distinction between the runtime state and the two (!) toml formats that are almost the same but not quite. - runtime state: `struct PageServerConf`, now without `serde` derives - toml format 1: the state in `.neon/config` => `struct OnDiskState` - toml format 2: the `neon_local init --config TMPFILE` that, unlike `struct OnDiskState`, allows specifying `pageservers` - Remove `[[pageservers]]` from the `struct OnDiskState` and load the data from the individual `pageserver.toml`s instead. --- control_plane/src/bin/neon_local.rs | 151 +++---- control_plane/src/local_env.rs | 468 +++++++++++++-------- control_plane/src/pageserver.rs | 81 +--- test_runner/fixtures/neon_fixtures.py | 54 +-- test_runner/regress/test_pageserver_api.py | 35 +- 5 files changed, 412 insertions(+), 377 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 179a756135..18e395e2b5 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -9,8 +9,11 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::{InitForceMode, LocalEnv}; -use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; +use control_plane::local_env::{ + InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, + SafekeeperConf, +}; +use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; @@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; -fn default_conf(num_pageservers: u16) -> String { - let mut template = format!( - r#" -# Default built-in configuration, defined in main.rs -control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' - -[broker] -listen_addr = '{DEFAULT_BROKER_ADDR}' - -[[safekeepers]] -id = {DEFAULT_SAFEKEEPER_ID} -pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} -http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} - -"#, - ); - - for i in 0..num_pageservers { - let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); - let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; - let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; - - template += &format!( - r#" -[[pageservers]] -id = {pageserver_id} -listen_pg_addr = '127.0.0.1:{pg_port}' -listen_http_addr = '127.0.0.1:{http_port}' -pg_auth_type = '{trust_auth}' -http_auth_type = '{trust_auth}' -"#, - trust_auth = AuthType::Trust, - ) - } - - template -} - /// /// Timelines tree element used as a value in the HashMap. /// @@ -152,7 +117,7 @@ fn main() -> Result<()> { }; match subcommand_result { - Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(None) => (), Err(e) => { eprintln!("command failed: {e:?}"); @@ -341,55 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let num_pageservers = init_match - .get_one::("num-pageservers") - .expect("num-pageservers arg has a default"); - // Create config file - let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { + let num_pageservers = init_match.get_one::("num-pageservers"); + + let force = init_match.get_one("force").expect("we set a default value"); + + // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. + let init_conf: NeonLocalInitConf = if let Some(config_path) = + init_match.get_one::("config") + { + // User (likely the Python test suite) provided a description of the environment. + if num_pageservers.is_some() { + bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + } // load and parse the file - std::fs::read_to_string(config_path).with_context(|| { + let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) - })? + })?; + toml_edit::de::from_str(&contents)? } else { - // Built-in default config - default_conf(*num_pageservers) + // User (likely interactive) did not provide a description of the environment, give them the default + NeonLocalInitConf { + control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + broker: NeonBroker { + listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), + }, + safekeepers: vec![SafekeeperConf { + id: DEFAULT_SAFEKEEPER_ID, + pg_port: DEFAULT_SAFEKEEPER_PG_PORT, + http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, + ..Default::default() + }], + pageservers: (0..num_pageservers.copied().unwrap_or(1)) + .map(|i| { + let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); + let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; + let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; + NeonLocalInitPageserverConf { + id: pageserver_id, + listen_pg_addr: format!("127.0.0.1:{pg_port}"), + listen_http_addr: format!("127.0.0.1:{http_port}"), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + other: Default::default(), + } + }) + .collect(), + pg_distrib_dir: None, + neon_distrib_dir: None, + default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), + storage_controller: None, + control_plane_compute_hook_api: None, + } }; - let pageserver_config: toml_edit::Document = - if let Some(path) = init_match.get_one::("pageserver-config") { - std::fs::read_to_string(path)?.parse()? - } else { - toml_edit::Document::new() - }; - - let pg_version = init_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let mut env = - LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - let force = init_match.get_one("force").expect("we set a default value"); - env.init(pg_version, force) - .context("Failed to initialize neon repository")?; - - // Create remote storage location for default LocalFs remote storage - std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - - // Initialize pageserver, create initial tenant and timeline. - for ps_conf in &env.pageservers { - PageServerNode::from_env(&env, ps_conf) - .initialize(pageserver_config.clone()) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); - } - - Ok(env) + LocalEnv::init(init_conf, force) + .context("materialize initial neon_local environment on disk")?; + Ok(LocalEnv::load_config().expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. @@ -1418,9 +1393,7 @@ fn cli() -> Command { let num_pageservers_arg = Arg::new("num-pageservers") .value_parser(value_parser!(u16)) .long("num-pageservers") - .help("How many pageservers to create (default 1)") - .required(false) - .default_value("1"); + .help("How many pageservers to create (default 1)"); let update_catalog = Arg::new("update-catalog") .value_parser(value_parser!(bool)) @@ -1454,14 +1427,6 @@ fn cli() -> Command { .value_parser(value_parser!(PathBuf)) .value_name("config") ) - .arg( - Arg::new("pageserver-config") - .long("pageserver-config") - .required(false) - .value_parser(value_parser!(PathBuf)) - .value_name("pageserver-config") - .help("Merge the provided pageserver config into the one generated by neon_local."), - ) .arg(pg_version_arg.clone()) .arg(force_arg) ) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 7abbbce95a..d13884198e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,7 +3,7 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use clap::ValueEnum; use postgres_backend::AuthType; @@ -23,6 +23,8 @@ use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, }; +use crate::pageserver::PageServerNode; +use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 15; @@ -34,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15; // to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute endpoints). @@ -42,59 +44,99 @@ pub struct LocalEnv { // This is not stored in the config file. Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or // '.neon' if not given. - #[serde(skip)] pub base_data_dir: PathBuf, // Path to postgres distribution. It's expected that "bin", "include", // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. - #[serde(default)] pub pg_distrib_dir: PathBuf, // Path to pageserver binary. - #[serde(default)] pub neon_distrib_dir: PathBuf, // Default tenant ID to use with the 'neon_local' command line utility, when // --tenant_id is not explicitly specified. - #[serde(default)] pub default_tenant_id: Option, // used to issue tokens during e.g pg start - #[serde(default)] pub private_key_path: PathBuf, pub broker: NeonBroker, // Configuration for the storage controller (1 per neon_local environment) - #[serde(default)] pub storage_controller: NeonStorageControllerConf, /// This Vec must always contain at least one pageserver + /// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s. + /// NB: not used anymore except for informing users that they need to change their `.neon/config`. pub pageservers: Vec, - #[serde(default)] pub safekeepers: Vec, // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - #[serde(default)] pub control_plane_api: Option, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. - #[serde(default)] pub control_plane_compute_hook_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. - #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub branch_name_mappings: HashMap>, +} + +/// On-disk state stored in `.neon/config`. +#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct OnDiskConfig { + pub pg_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, + pub default_tenant_id: Option, + pub private_key_path: PathBuf, + pub broker: NeonBroker, + pub storage_controller: NeonStorageControllerConf, + #[serde( + skip_serializing, + deserialize_with = "fail_if_pageservers_field_specified" + )] + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option, + pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, } +fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + Err(serde::de::Error::custom( + "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \ + Please remove the `pageservers` from your .neon/config.", + )) +} + +/// The description of the neon_local env to be initialized by `neon_local init --config`. +#[derive(Clone, Debug, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NeonLocalInitConf { + // TODO: do we need this? Seems unused + pub pg_distrib_dir: Option, + // TODO: do we need this? Seems unused + pub neon_distrib_dir: Option, + pub default_tenant_id: TenantId, + pub broker: NeonBroker, + pub storage_controller: Option, + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option>, + pub control_plane_compute_hook_api: Option>, +} + /// Broker config for cluster internal communication. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -141,24 +183,18 @@ impl NeonBroker { } } +// neon_local needs to know this subset of pageserver configuration. +// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`. +// It can get stale if `pageserver.toml` is changed. +// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml` #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default, deny_unknown_fields)] pub struct PageServerConf { - // node id pub id: NodeId, - - // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - - // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - - pub(crate) virtual_file_io_engine: Option, - pub(crate) get_vectored_impl: Option, - pub(crate) get_impl: Option, - pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -169,10 +205,40 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - virtual_file_io_engine: None, - get_vectored_impl: None, - get_impl: None, - validate_vectored_get: None, + } + } +} + +/// The toml that can be passed to `neon_local init --config`. +/// This is a subset of the `pageserver.toml` configuration. +// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +pub struct NeonLocalInitPageserverConf { + pub id: NodeId, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, + #[serde(flatten)] + pub other: HashMap, +} + +impl From<&NeonLocalInitPageserverConf> for PageServerConf { + fn from(conf: &NeonLocalInitPageserverConf) -> Self { + let NeonLocalInitPageserverConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + other: _, + } = conf; + Self { + id: *id, + listen_pg_addr: listen_pg_addr.clone(), + listen_http_addr: listen_http_addr.clone(), + pg_auth_type: *pg_auth_type, + http_auth_type: *http_auth_type, } } } @@ -360,44 +426,7 @@ impl LocalEnv { .collect() } - /// Create a LocalEnv from a config file. - /// - /// Unlike 'load_config', this function fills in any defaults that are missing - /// from the config file. - pub fn parse_config(toml: &str) -> anyhow::Result { - let mut env: LocalEnv = toml::from_str(toml)?; - - // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". - // Note that later in the code we assume, that distrib dirs follow the same pattern - // for all postgres versions. - if env.pg_distrib_dir == Path::new("") { - if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { - env.pg_distrib_dir = postgres_bin.into(); - } else { - let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install") - } - } - - // Find neon binaries. - if env.neon_distrib_dir == Path::new("") { - env::current_exe()? - .parent() - .unwrap() - .clone_into(&mut env.neon_distrib_dir); - } - - if env.pageservers.is_empty() { - anyhow::bail!("Configuration must contain at least one pageserver"); - } - - env.base_data_dir = base_path(); - - Ok(env) - } - - /// Locate and load config + /// Construct `Self` from on-disk state. pub fn load_config() -> anyhow::Result { let repopath = base_path(); @@ -411,38 +440,129 @@ impl LocalEnv { // TODO: check that it looks like a neon repository // load and parse file - let config = fs::read_to_string(repopath.join("config"))?; - let mut env: LocalEnv = toml::from_str(config.as_str())?; + let config_file_contents = fs::read_to_string(repopath.join("config"))?; + let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; + let mut env = { + let OnDiskConfig { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } = on_disk_config; + LocalEnv { + base_data_dir: repopath.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } + }; - env.base_data_dir = repopath; + // The source of truth for pageserver configuration is the pageserver.toml. + assert!( + env.pageservers.is_empty(), + "we ensure this during deserialization" + ); + env.pageservers = { + let iter = std::fs::read_dir(&repopath).context("open dir")?; + let mut pageservers = Vec::new(); + for res in iter { + let dentry = res?; + const PREFIX: &str = "pageserver_"; + let dentry_name = dentry + .file_name() + .into_string() + .ok() + .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path())) + .unwrap(); + if !dentry_name.starts_with(PREFIX) { + continue; + } + if !dentry.file_type().context("determine file type")?.is_dir() { + anyhow::bail!("expected a directory, got {:?}", dentry.path()); + } + let id = dentry_name[PREFIX.len()..] + .parse::() + .with_context(|| format!("parse id from {:?}", dentry.path()))?; + // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) + #[derive(serde::Serialize, serde::Deserialize)] + // (allow unknown fields, unlike PageServerConf) + struct PageserverConfigTomlSubset { + id: NodeId, + listen_pg_addr: String, + listen_http_addr: String, + pg_auth_type: AuthType, + http_auth_type: AuthType, + } + let config_toml_path = dentry.path().join("pageserver.toml"); + let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&config_toml_path) + .with_context(|| format!("read {:?}", config_toml_path))?, + ) + .context("parse pageserver.toml")?; + let PageserverConfigTomlSubset { + id: config_toml_id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + } = config_toml; + let conf = PageServerConf { + id: { + anyhow::ensure!( + config_toml_id == id, + "id mismatch: config_toml.id={config_toml_id} id={id}", + ); + id + }, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + }; + pageservers.push(conf); + } + pageservers + }; Ok(env) } - pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'neon_local init --config=' - // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .neon/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - let mut conf_content = r#"# This file describes a local deployment of the page server -# and safekeeeper node. It is read by the 'neon_local' command-line -# utility. -"# - .to_string(); - - // Convert the LocalEnv to a toml file. - // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + pub fn persist_config(&self) -> anyhow::Result<()> { + Self::persist_config_impl( + &self.base_data_dir, + &OnDiskConfig { + pg_distrib_dir: self.pg_distrib_dir.clone(), + neon_distrib_dir: self.neon_distrib_dir.clone(), + default_tenant_id: self.default_tenant_id, + private_key_path: self.private_key_path.clone(), + broker: self.broker.clone(), + storage_controller: self.storage_controller.clone(), + pageservers: vec![], // it's skip_serializing anyway + safekeepers: self.safekeepers.clone(), + control_plane_api: self.control_plane_api.clone(), + control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + branch_name_mappings: self.branch_name_mappings.clone(), + }, + ) + } + pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> { + let conf_content = &toml::to_string_pretty(config)?; let target_config_path = base_path.join("config"); fs::write(&target_config_path, conf_content).with_context(|| { format!( @@ -467,17 +587,13 @@ impl LocalEnv { } } - // - // Initialize a new Neon repository - // - pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> { - // check if config already exists - let base_path = &self.base_data_dir; - ensure!( - base_path != Path::new(""), - "repository base path is missing" - ); + /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. + pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { + let base_path = base_path(); + assert_ne!(base_path, Path::new("")); + let base_path = &base_path; + // create base_path dir if base_path.exists() { match force { InitForceMode::MustNotExist => { @@ -509,74 +625,96 @@ impl LocalEnv { } } } - - if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_bin_dir(pg_version)?.display() - ); - } - for binary in ["pageserver", "safekeeper"] { - if !self.neon_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{binary}' in neon distrib dir '{}'", - self.neon_distrib_dir.display() - ); - } - } - if !base_path.exists() { fs::create_dir(base_path)?; } + let NeonLocalInitConf { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + } = conf; + + // Find postgres binaries. + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. + let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| { + if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { + postgres_bin.into() + } else { + let cwd = env::current_dir().unwrap(); + cwd.join("pg_install") + } + }); + + // Find neon binaries. + let neon_distrib_dir = neon_distrib_dir + .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned()); + // Generate keypair for JWT. // // The keypair is only needed if authentication is enabled in any of the // components. For convenience, we generate the keypair even if authentication // is not enabled, so that you can easily enable it after the initialization - // step. However, if the key generation fails, we treat it as non-fatal if - // authentication was not enabled. - if self.private_key_path == PathBuf::new() { - match generate_auth_keys( - base_path.join("auth_private_key.pem").as_path(), - base_path.join("auth_public_key.pem").as_path(), - ) { - Ok(()) => { - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } - Err(e) => { - if !self.auth_keys_needed() { - eprintln!("Could not generate keypair for JWT authentication: {e}"); - eprintln!("Continuing anyway because authentication was not enabled"); - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } else { - return Err(e); - } - } - } + // step. + generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) + .context("generate auth keys")?; + let private_key_path = PathBuf::from("auth_private_key.pem"); + + // create the runtime type because the remaining initialization code below needs + // a LocalEnv instance op operation + // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state + let env = LocalEnv { + base_data_dir: base_path.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id: Some(default_tenant_id), + private_key_path, + broker, + storage_controller: storage_controller.unwrap_or_default(), + pageservers: pageservers.iter().map(Into::into).collect(), + safekeepers, + control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + branch_name_mappings: Default::default(), + }; + + // create endpoints dir + fs::create_dir_all(env.endpoints_path())?; + + // create safekeeper dirs + for safekeeper in &env.safekeepers { + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; } - fs::create_dir_all(self.endpoints_path())?; - - for safekeeper in &self.safekeepers { - fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; + // initialize pageserver state + for (i, ps) in pageservers.into_iter().enumerate() { + let runtime_ps = &env.pageservers[i]; + assert_eq!(&PageServerConf::from(&ps), runtime_ps); + fs::create_dir(env.pageserver_data_dir(ps.id))?; + PageServerNode::from_env(&env, runtime_ps) + .initialize(ps) + .context("pageserver init failed")?; } - for ps in &self.pageservers { - fs::create_dir(self.pageserver_data_dir(ps.id))?; - } + // setup remote remote location for default LocalFs remote storage + std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - self.persist_config(base_path) - } - - fn auth_keys_needed(&self) -> bool { - self.pageservers.iter().any(|ps| { - ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT - }) || self.safekeepers.iter().any(|sk| sk.auth_enabled) + env.persist_config() } } -fn base_path() -> PathBuf { +pub fn base_path() -> PathBuf { match std::env::var_os("NEON_REPO_DIR") { Some(val) => PathBuf::from(val), None => PathBuf::from(".neon"), @@ -619,31 +757,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_conf_parsing() { - let simple_conf_toml = include_str!("../simple.conf"); - let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); - assert!( - simple_conf_parse_result.is_ok(), - "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" - ); - - let string_to_replace = "listen_addr = '127.0.0.1:50051'"; - let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; - let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); - assert!( - spoiled_url_toml.contains(spoiled_url_str), - "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" - ); - let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); - assert!( - spoiled_url_parse_result.is_err(), - "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" - ); - } -} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 6046c93bad..5a84763697 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -30,7 +30,7 @@ use utils::{ lsn::Lsn, }; -use crate::local_env::PageServerConf; +use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. @@ -76,9 +76,11 @@ impl PageServerNode { fn pageserver_init_make_toml( &self, - cli_overrides: toml_edit::Document, + conf: NeonLocalInitPageserverConf, ) -> anyhow::Result { - // TODO: this is a legacy code, it should be refactored to use toml_edit directly. + assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + + // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( @@ -86,61 +88,9 @@ impl PageServerNode { self.env.pg_distrib_dir_raw().display() ); - let PageServerConf { - id, - listen_pg_addr, - listen_http_addr, - pg_auth_type, - http_auth_type, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - } = &self.conf; - - let id = format!("id={}", id); - - let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); - - let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); - let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { - format!("virtual_file_io_engine='{virtual_file_io_engine}'") - } else { - String::new() - }; - let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { - format!("get_vectored_impl='{get_vectored_impl}'") - } else { - String::new() - }; - let get_impl = if let Some(get_impl) = get_impl { - format!("get_impl='{get_impl}'") - } else { - String::new() - }; - let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { - format!("validate_vectored_get={validate_vectored_get}") - } else { - String::new() - }; - let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut overrides = vec![ - id, - pg_distrib_dir_param, - http_auth_type_param, - pg_auth_type_param, - listen_http_addr_param, - listen_pg_addr_param, - broker_endpoint_param, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - ]; + let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; if let Some(control_plane_api) = &self.env.control_plane_api { overrides.push(format!( @@ -150,7 +100,7 @@ impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. - if matches!(http_auth_type, AuthType::NeonJWT) { + if matches!(conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -159,20 +109,23 @@ impl PageServerNode { } } - if !cli_overrides.contains_key("remote_storage") { + if !conf.other.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); } - if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { + if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } // Apply the user-provided overrides - overrides.push(cli_overrides.to_string()); + overrides.push( + toml_edit::ser::to_string_pretty(&conf) + .expect("we deserialized this from toml earlier"), + ); // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. @@ -188,8 +141,8 @@ impl PageServerNode { } /// Initializes a pageserver node by creating its config with the overrides provided. - pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> { - self.pageserver_init(config_overrides) + pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { + self.pageserver_init(conf) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } @@ -209,7 +162,7 @@ impl PageServerNode { self.start_node().await } - fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> { + fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -221,7 +174,7 @@ impl PageServerNode { io::stdout().flush()?; let config = self - .pageserver_init_make_toml(cli_overrides) + .pageserver_init_make_toml(conf) .context("make pageserver toml")?; let config_file_path = datadir.join("pageserver.toml"); let mut config_file = std::fs::OpenOptions::new() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 30cec4c726..f618c508bc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,7 +14,7 @@ import textwrap import threading import time import uuid -from contextlib import ExitStack, closing, contextmanager +from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime from enum import Enum @@ -1054,14 +1054,14 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - # Create a config file corresponding to the options + # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), "broker": { "listen_addr": self.broker.listen_addr(), }, - "pageservers": [], "safekeepers": [], + "pageservers": [], } if self.control_plane_api is not None: @@ -1100,6 +1100,17 @@ class NeonEnv: if config.pageserver_validate_vectored_get is not None: ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + if self.pageserver_remote_storage is not None: + ps_cfg["remote_storage"] = remote_storage_to_toml_dict( + self.pageserver_remote_storage + ) + + if config.pageserver_config_override is not None: + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_cfg[key] = value + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( @@ -1136,7 +1147,6 @@ class NeonEnv: self.neon_cli.init( cfg, force=config.config_init_force, - pageserver_config_override=config.pageserver_config_override, ) def start(self): @@ -1722,46 +1732,22 @@ class NeonCli(AbstractNeonCli): def init( self, - config: Dict[str, Any], + init_config: Dict[str, Any], force: Optional[str] = None, - pageserver_config_override: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": - remote_storage = self.env.pageserver_remote_storage - - ps_config = {} - if remote_storage is not None: - ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage) - - if pageserver_config_override is not None: - for o in pageserver_config_override.split(";"): - override = toml.loads(o) - for key, value in override.items(): - ps_config[key] = value - - with ExitStack() as stack: - ps_config_file = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) - ps_config_file.write(toml.dumps(ps_config)) - ps_config_file.flush() - - neon_local_config = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+")) - neon_local_config.write(toml.dumps(config)) - neon_local_config.flush() + with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: + init_config_tmpfile.write(toml.dumps(init_config)) + init_config_tmpfile.flush() cmd = [ "init", - f"--config={neon_local_config.name}", - "--pg-version", - self.env.pg_version, - f"--pageserver-config={ps_config_file.name}", + f"--config={init_config_tmpfile.name}", ] if force is not None: cmd.extend(["--force", force]) - s3_env_vars = None - if isinstance(remote_storage, S3Storage): - s3_env_vars = remote_storage.access_env_vars() - res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) + res = self.raw_cli(cmd) res.check_returncode() return res diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index be351db429..bd7e4f118f 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -2,6 +2,7 @@ import subprocess from pathlib import Path from typing import Optional +import toml from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -12,9 +13,11 @@ from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until -def test_pageserver_init_node_id( - neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path -): +def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): + """ + NB: The neon_local doesn't use `--init` mode anymore, but our production + deployment still does => https://github.com/neondatabase/aws/pull/1322 + """ workdir = neon_simple_env.pageserver.workdir pageserver_config = workdir / "pageserver.toml" pageserver_bin = neon_binpath / "pageserver" @@ -28,18 +31,36 @@ def test_pageserver_init_node_id( stderr=subprocess.PIPE, ) - # remove initial config and stop existing pageserver - pageserver_config.unlink() neon_simple_env.pageserver.stop() - bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + with open(neon_simple_env.pageserver.config_toml_path, "r") as f: + ps_config = toml.load(f) + + required_config_keys = [ + "pg_distrib_dir", + "listen_pg_addr", + "listen_http_addr", + "pg_auth_type", + "http_auth_type", + ] + required_config_overrides = [ + f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys + ] + + pageserver_config.unlink() + + bad_init = run_pageserver(["--init", *required_config_overrides]) assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + good_init_cmd = [ + "--init", + f"--config-override=id={ps_config['id']}", + *required_config_overrides, + ] completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 From 0457980728d93e6c3a4fc25b6f5b6052bdff1457 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 8 May 2024 16:22:13 +0100 Subject: [PATCH 005/130] Fix flaky test_gc_of_remote_layers (#7647) Fixes flaky test `test_gc_of_remote_layers`, which was failing because of the `Nothing to GC` pageserver log. I looked into the fails, it seems that backround `gc_loop` sometimes started GC for initial tenant, which wasn't configured to disable GC. The fix is to not create initial tenant with enabled gc at all. Fixes #7538 --- test_runner/regress/test_layer_eviction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 5c967fd72e..b178baea11 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -159,7 +159,9 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + # don't create initial tenant, we'll create it manually with custom config + env = neon_env_builder.init_configs() + env.start() tenant_config = { "pitr_interval": "1s", # set to non-zero, so GC actually does something From 21e1a496a3f706097578de396a9107813c541001 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 8 May 2024 08:49:57 -0700 Subject: [PATCH 006/130] Expose LSN and replication delay as metrics (#7610) ## Problem We currently have no way to see what the current LSN of a compute its, and in case of read replicas, we don't know what the difference in LSNs is. ## Summary of changes Adds these metrics --- vm-image-spec.yaml | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 41ca16f16b..56538630ac 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -244,6 +244,49 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + + - metric_name: current_lsn + type: gauge + help: 'Current LSN of the database' + key_labels: + values: [lsn] + query: | + select + case + when pg_catalog.pg_is_in_recovery() + then pg_last_wal_replay_lsn() + else pg_current_wal_lsn() + end as lsn; + + - metric_name: replication_delay_bytes + type: gauge + help: 'Bytes between received and replayed LSN' + key_labels: + values: [replication_delay_bytes] + query: | + SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + + - metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + + - metric_name: checkpoint_stats + type: gauge + help: 'Number of requested and scheduled checkpoints' + key_labels: + values: + - checkpoints_req + - checkpoints_timed + query: | + SELECT checkpoints_req, checkpoints_timed FROM pg_stat_bgwriter; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling @@ -295,7 +338,6 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; - build: | # Build cgroup-tools # From 1173ee6a7e1168e671a6847eb94807b45c703490 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 8 May 2024 11:53:54 -0400 Subject: [PATCH 007/130] chore(neon_test_utils): restrict installation to superuser (#7624) The test utils should only be used during tests. Users should not be able to create this extension on their own. Signed-off-by: Alex Chi Z --- pgxn/neon_test_utils/neon_test_utils.control | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5f6d640835..8c5b9b5dfe 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -3,4 +3,5 @@ comment = 'helpers for neon testing and debugging' default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true -trusted = true +trusted = false +superuser = true From ca154d9cd843dcc10d234266be0effff091e71e7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 8 May 2024 17:50:21 +0100 Subject: [PATCH 008/130] pageserver: local layer path followups (#7640) - Rename "filename" types which no longer map directly to a filename (LayerFileName -> LayerName) - Add a -v1- part to local layer paths to smooth the path to future updates (we anticipate a -v2- that uses checksums later) - Rename methods that refer to the string-ized version of a LayerName to no longer be called "filename" - Refactor reconcile() function to use a LocalLayerFileMetadata type that includes the local path, rather than carrying local path separately in a tuple and unwrap()'ing it later. --- pageserver/benches/bench_layer_map.rs | 4 +- pageserver/ctl/src/index_part.rs | 4 +- pageserver/src/deletion_queue.rs | 16 +-- pageserver/src/deletion_queue/list_writer.rs | 4 +- pageserver/src/disk_usage_eviction_task.rs | 8 +- pageserver/src/http/routes.rs | 6 +- .../src/tenant/remote_timeline_client.rs | 64 +++++----- .../tenant/remote_timeline_client/download.rs | 4 +- .../tenant/remote_timeline_client/index.rs | 6 +- pageserver/src/tenant/secondary.rs | 4 +- pageserver/src/tenant/secondary/downloader.rs | 12 +- pageserver/src/tenant/secondary/heatmap.rs | 8 +- pageserver/src/tenant/storage_layer.rs | 20 +-- .../src/tenant/storage_layer/delta_layer.rs | 14 ++- .../src/tenant/storage_layer/image_layer.rs | 21 ++-- pageserver/src/tenant/storage_layer/layer.rs | 39 +++--- .../src/tenant/storage_layer/layer_desc.rs | 28 ++--- .../{filename.rs => layer_name.rs} | 115 +++++++++--------- pageserver/src/tenant/timeline.rs | 71 +++++------ .../src/tenant/timeline/detach_ancestor.rs | 2 +- pageserver/src/tenant/timeline/init.rs | 93 ++++++++------ .../src/tenant/timeline/layer_manager.rs | 2 +- pageserver/src/tenant/upload_queue.rs | 8 +- s3_scrubber/src/checks.rs | 23 ++-- s3_scrubber/src/tenant_snapshot.rs | 18 +-- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/fixtures/pageserver/types.py | 29 +++-- .../regress/test_layers_from_future.py | 8 +- .../regress/test_pageserver_generations.py | 4 +- 29 files changed, 324 insertions(+), 315 deletions(-) rename pageserver/src/tenant/storage_layer/{filename.rs => layer_name.rs} (72%) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5d05af0c00..1d02aa7709 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,7 +1,7 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut updates = layer_map.batch_update(); for fname in filenames { let fname = fname.unwrap(); - let fname = LayerFileName::from_str(&fname).unwrap(); + let fname = LayerName::from_str(&fname).unwrap(); let layer = PersistentLayerDesc::from(fname); let lsn_range = layer.get_lsn_range(); diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20e5572914..0d010eb009 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use anyhow::Context; use camino::Utf8PathBuf; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; use utils::lsn::Lsn; @@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; #[derive(serde::Serialize)] struct Output<'a> { - layer_metadata: &'a HashMap, + layer_metadata: &'a HashMap, disk_consistent_lsn: Lsn, timeline_metadata: &'a TimelineMetadata, } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e3c11cb299..c937309d83 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -38,7 +38,7 @@ use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; +use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; // TODO: configurable for how long to wait before executing deletions @@ -479,7 +479,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); @@ -511,7 +511,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -734,20 +734,20 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, + tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; use super::*; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), }); // When you need a second layer in a test. - pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), }); @@ -797,7 +797,7 @@ mod test { /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, - file_name: LayerFileName, + file_name: LayerName, gen: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; @@ -952,7 +952,7 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 3a3d600ac2..ae3b2c9180 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -59,7 +59,7 @@ pub(super) struct DeletionOp { // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. - pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 400930245b..ebeb8bbb20 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -64,7 +64,7 @@ use crate::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, }; @@ -604,7 +604,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( pub(crate) struct EvictionSecondaryLayer { pub(crate) secondary_tenant: Arc, pub(crate) timeline_id: TimelineId, - pub(crate) name: LayerFileName, + pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, } @@ -637,9 +637,9 @@ impl EvictionLayer { } } - pub(crate) fn get_name(&self) -> LayerFileName { + pub(crate) fn get_name(&self) -> LayerName { match self { - Self::Attached(l) => l.layer_desc().filename(), + Self::Attached(l) => l.layer_desc().layer_name(), Self::Secondary(sl) => sl.name.clone(), } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 83b7b8a45e..a8ca642dc5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -63,7 +63,7 @@ use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; use crate::tenant::SpawnMode; @@ -1229,7 +1229,7 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let layer_name = LayerFileName::from_str(layer_file_name) + let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); @@ -1261,7 +1261,7 @@ async fn evict_timeline_layer_handler( let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); - let layer_name = LayerFileName::from_str(layer_file_name) + let layer_name = LayerName::from_str(layer_file_name) .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let timeline = diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 356a0dc51c..bbe4e16378 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -240,7 +240,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; use super::metadata::MetadataUpdate; -use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; +use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; @@ -503,7 +503,7 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, ctx: &RequestContext, @@ -677,7 +677,7 @@ impl RemoteTimelineClient { for layer in layers { upload_queue .latest_files - .insert(layer.layer_desc().filename(), layer.metadata()); + .insert(layer.layer_desc().layer_name(), layer.metadata()); } self.schedule_index_upload(upload_queue); @@ -713,7 +713,7 @@ impl RemoteTimelineClient { upload_queue .latest_files - .insert(layer.layer_desc().filename(), metadata.clone()); + .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; info!( @@ -737,7 +737,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: &[LayerName], ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -765,7 +765,7 @@ impl RemoteTimelineClient { // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. - let names = gc_layers.iter().map(|x| x.layer_desc().filename()); + let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); @@ -780,9 +780,9 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, LayerFileMetadata)> + ) -> Vec<(LayerName, LayerFileMetadata)> where - I: IntoIterator, + I: IntoIterator, { // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata @@ -832,7 +832,7 @@ impl RemoteTimelineClient { /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -845,7 +845,7 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. These are // layers that originate from some ancestor shard after a split, and may still @@ -914,7 +914,7 @@ impl RemoteTimelineClient { self.schedule_layer_file_upload0(upload_queue, layer.clone()); } - let names = compacted_from.iter().map(|x| x.layer_desc().filename()); + let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -1144,7 +1144,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, self.tenant_shard_id.to_index(), - &uploaded.layer_desc().filename(), + &uploaded.layer_desc().layer_name(), uploaded.metadata().generation, ); @@ -1185,7 +1185,7 @@ impl RemoteTimelineClient { .get_timeline_id() .expect("Source timeline should be alive"), self.tenant_shard_id.to_index(), - &adopted.layer_desc().filename(), + &adopted.layer_desc().layer_name(), adopted.metadata().generation, ); @@ -1193,7 +1193,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, self.tenant_shard_id.to_index(), - &adopted_as.layer_desc().filename(), + &adopted_as.layer_desc().layer_name(), adopted_as.metadata().generation, ); @@ -1527,7 +1527,7 @@ impl RemoteTimelineClient { &self.tenant_shard_id.tenant_id, &self.timeline_id, layer_metadata.shard, - &layer.layer_desc().filename(), + &layer.layer_desc().layer_name(), layer_metadata.generation, ); @@ -1896,14 +1896,14 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), - layer_file_name.file_name(), + layer_file_name, generation.get_suffix() ); @@ -2000,8 +2000,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.to_string()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -2127,7 +2127,7 @@ mod tests { .layer_metadata .keys() .map(|f| f.to_owned()) - .collect::>(); + .collect::>(); let initial_layer = { assert!(initial_layers.len() == 1); initial_layers.into_iter().next().unwrap() @@ -2153,7 +2153,7 @@ mod tests { ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) ] .into_iter() - .map(|(name, contents): (LayerFileName, Vec)| { + .map(|(name, contents): (LayerName, Vec)| { let local_path = local_layer_path( harness.conf, @@ -2234,9 +2234,9 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), ], ); assert_eq!(index_part.metadata, metadata); @@ -2250,7 +2250,7 @@ mod tests { // keep using schedule_layer_file_deletion because we don't have a way to wait for the // spawn_blocking started by the drop. client - .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) + .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()]) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2268,9 +2268,9 @@ mod tests { } assert_remote_files( &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2283,9 +2283,9 @@ mod tests { assert_remote_files( &[ - &initial_layer.file_name(), - &layers[1].layer_desc().filename().file_name(), - &layers[2].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[1].layer_desc().layer_name().to_string(), + &layers[2].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2305,7 +2305,7 @@ mod tests { } = TestSetup::new("metrics").await.unwrap(); let client = timeline.remote_client.as_ref().unwrap(); - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let local_path = local_layer_path( harness.conf, &timeline.tenant_shard_id, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c86b22d481..b464437422 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -22,7 +22,7 @@ use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::layer::local_layer_path; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; @@ -48,7 +48,7 @@ pub async fn download_layer_file<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, + layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, ctx: &RequestContext, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 7e0619945f..3e05905afa 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -8,7 +8,7 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -75,7 +75,7 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated for convenience when reading the serialized structure, but is @@ -104,7 +104,7 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; fn new( - layers_and_metadata: &HashMap, + layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, ) -> Self { diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 0bb25f0ace..7075044baf 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -23,7 +23,7 @@ use super::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerFileName}, + storage_layer::{layer::local_layer_path, LayerName}, }; use pageserver_api::{ @@ -182,7 +182,7 @@ impl SecondaryTenant { self: &Arc, conf: &PageServerConf, timeline_id: TimelineId, - name: LayerFileName, + name: LayerName, metadata: LayerFileMetadata, ) { debug_assert_current_span_has_tenant_id(); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 092630e74d..2a8f83be95 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerFileName}, + storage_layer::{layer::local_layer_path, LayerName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -111,7 +111,7 @@ impl OnDiskState { _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, _imeline_id: &TimelineId, - _ame: LayerFileName, + _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { @@ -124,10 +124,10 @@ impl OnDiskState { #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + pub(super) on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. - pub(super) evicted_at: HashMap, + pub(super) evicted_at: HashMap, } /// This state is written by the secondary downloader, it is opaque @@ -997,7 +997,7 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. - let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); while let Some(dentry) = dir @@ -1034,7 +1034,7 @@ async fn init_timeline_state( continue; } - match LayerFileName::from_str(file_name) { + match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 73cdf6c6d4..ca91ec24c6 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,8 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{ - remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, -}; +use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -31,7 +29,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerFileName, + pub(super) name: LayerName, pub(super) metadata: IndexLayerMetadata, #[serde_as(as = "TimestampSeconds")] @@ -42,7 +40,7 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( - name: LayerFileName, + name: LayerName, metadata: IndexLayerMetadata, access_time: SystemTime, ) -> Self { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4f1b56ef9f..94a5e9ec47 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,11 +1,11 @@ //! Common traits and structs for layers pub mod delta_layer; -mod filename; pub mod image_layer; pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; +mod layer_name; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; @@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit; use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; @@ -646,8 +646,8 @@ pub mod tests { use super::*; - impl From for PersistentLayerDesc { - fn from(value: DeltaFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: DeltaLayerName) -> Self { PersistentLayerDesc::new_delta( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -658,8 +658,8 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: ImageFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: ImageLayerName) -> Self { PersistentLayerDesc::new_img( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -670,11 +670,11 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: LayerFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: LayerName) -> Self { match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + LayerName::Delta(d) => Self::from(d), + LayerName::Image(i) => Self::from(i), } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6fd96b0e2f..c38c9bb656 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -57,6 +57,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tracing::*; @@ -68,7 +69,8 @@ use utils::{ }; use super::{ - AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, + AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, }; /// @@ -309,13 +311,13 @@ impl DeltaLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(Arc::new(loaded)) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1477a1fc33..c9874873e4 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -54,6 +54,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; @@ -65,8 +66,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -231,7 +234,7 @@ impl ImageLayer { conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - fname: &ImageFileName, + fname: &ImageLayerName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() .sample_iter(&Alphanumeric) @@ -267,13 +270,13 @@ impl ImageLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) @@ -635,7 +638,7 @@ impl ImageLayerWriterInner { conf, timeline_id, tenant_shard_id, - &ImageFileName { + &ImageLayerName { key_range: key_range.clone(), lsn, }, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b5e69db7f4..b5b0260327 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -25,7 +25,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, + AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; @@ -128,19 +128,20 @@ pub(crate) fn local_layer_path( conf: &PageServerConf, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, _generation: &Generation, ) -> Utf8PathBuf { let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); - timeline_path.join(layer_file_name.file_name()) + timeline_path.join(layer_file_name.to_string()) - // TOOD: include generation in the name in now+1 releases. - // timeline_path.join(format!( - // "{}{}", - // layer_file_name.file_name(), - // generation.get_suffix() - // )) + // TODO: switch to enabling new-style layer paths after next release + // if generation.is_none() { + // // Without a generation, we may only use legacy path style + // timeline_path.join(layer_file_name.to_string()) + // } else { + // timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + // } } impl Layer { @@ -148,7 +149,7 @@ impl Layer { pub(crate) fn for_evicted( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> Self { let local_path = local_layer_path( @@ -189,7 +190,7 @@ impl Layer { conf: &'static PageServerConf, timeline: &Arc, local_path: Utf8PathBuf, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( @@ -261,7 +262,7 @@ impl Layer { conf, &timeline.tenant_shard_id, &timeline.timeline_id, - &desc.filename(), + &desc.layer_name(), &timeline.generation, ); @@ -689,7 +690,7 @@ impl Drop for LayerInner { let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); - let file_name = self.layer_desc().filename(); + let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); @@ -782,7 +783,9 @@ impl LayerInner { LayerInner { conf, - debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, + debug_str: { + format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() + }, path: local_path, desc, timeline: Arc::downgrade(timeline), @@ -1120,7 +1123,7 @@ impl LayerInner { let result = client .download_layer_file( - &self.desc.filename(), + &self.desc.layer_name(), &self.metadata(), &timeline.cancel, ctx, @@ -1257,7 +1260,7 @@ impl LayerInner { } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - let layer_file_name = self.desc.filename().file_name(); + let layer_name = self.desc.layer_name().to_string(); let resident = self .inner @@ -1271,7 +1274,7 @@ impl LayerInner { let lsn_range = &self.desc.lsn_range; HistoricLayerInfo::Delta { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, @@ -1282,7 +1285,7 @@ impl LayerInner { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, remote: !resident, diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index c375923e81..a89b66e4a1 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; -use super::{DeltaFileName, ImageFileName, LayerFileName}; +use super::{DeltaLayerName, ImageLayerName, LayerName}; use serde::{Deserialize, Serialize}; @@ -51,7 +51,7 @@ impl PersistentLayerDesc { } pub fn short_id(&self) -> impl Display { - self.filename() + self.layer_name() } #[cfg(test)] @@ -103,14 +103,14 @@ impl PersistentLayerDesc { pub fn from_filename( tenant_shard_id: TenantShardId, timeline_id: TimelineId, - filename: LayerFileName, + filename: LayerName, file_size: u64, ) -> Self { match filename { - LayerFileName::Image(i) => { + LayerName::Image(i) => { Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } - LayerFileName::Delta(d) => Self::new_delta( + LayerName::Delta(d) => Self::new_delta( tenant_shard_id, timeline_id, d.key_range, @@ -132,34 +132,34 @@ impl PersistentLayerDesc { lsn..(lsn + 1) } - /// Get a delta file name for this layer. + /// Get a delta layer name for this layer. /// /// Panic: if this is not a delta layer. - pub fn delta_file_name(&self) -> DeltaFileName { + pub fn delta_layer_name(&self) -> DeltaLayerName { assert!(self.is_delta); - DeltaFileName { + DeltaLayerName { key_range: self.key_range.clone(), lsn_range: self.lsn_range.clone(), } } - /// Get a delta file name for this layer. + /// Get a image layer name for this layer. /// /// Panic: if this is not an image layer, or the lsn range is invalid - pub fn image_file_name(&self) -> ImageFileName { + pub fn image_layer_name(&self) -> ImageLayerName { assert!(!self.is_delta); assert!(self.lsn_range.start + 1 == self.lsn_range.end); - ImageFileName { + ImageLayerName { key_range: self.key_range.clone(), lsn: self.lsn_range.start, } } - pub fn filename(&self) -> LayerFileName { + pub fn layer_name(&self) -> LayerName { if self.is_delta { - self.delta_file_name().into() + self.delta_layer_name().into() } else { - self.image_file_name().into() + self.image_layer_name().into() } } diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs similarity index 72% rename from pageserver/src/tenant/storage_layer/filename.rs rename to pageserver/src/tenant/storage_layer/layer_name.rs index fff66a9d07..c733404693 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -15,29 +15,29 @@ use super::PersistentLayerDesc; // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] -pub struct DeltaFileName { +pub struct DeltaLayerName { pub key_range: Range, pub lsn_range: Range, } -impl std::fmt::Debug for DeltaFileName { +impl std::fmt::Debug for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("DeltaFileName") + f.debug_struct("DeltaLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .finish() } } -impl PartialOrd for DeltaFileName { +impl PartialOrd for DeltaLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for DeltaFileName { +impl Ord for DeltaLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -57,16 +57,14 @@ impl Ord for DeltaFileName { } } -/// Represents the filename of a DeltaLayer +/// Represents the region of the LSN-Key space covered by a DeltaLayer /// /// ```text /// -__- /// ``` -impl DeltaFileName { - /// - /// Parse a string as a delta file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl DeltaLayerName { + /// Parse the part of a delta layer's file name that represents the LayerName. Returns None + /// if the filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -105,14 +103,14 @@ impl DeltaFileName { // or panic? } - Some(DeltaFileName { + Some(DeltaLayerName { key_range: key_start..key_end, lsn_range: start_lsn..end_lsn, }) } } -impl fmt::Display for DeltaFileName { +impl fmt::Display for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -126,29 +124,29 @@ impl fmt::Display for DeltaFileName { } #[derive(PartialEq, Eq, Clone, Hash)] -pub struct ImageFileName { +pub struct ImageLayerName { pub key_range: Range, pub lsn: Lsn, } -impl std::fmt::Debug for ImageFileName { +impl std::fmt::Debug for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("ImageFileName") + f.debug_struct("ImageLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn", &self.lsn) .finish() } } -impl PartialOrd for ImageFileName { +impl PartialOrd for ImageLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ImageFileName { +impl Ord for ImageLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -164,7 +162,7 @@ impl Ord for ImageFileName { } } -impl ImageFileName { +impl ImageLayerName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste this all over PersistentLayerDesc::image_layer_lsn_range(self.lsn) @@ -172,16 +170,14 @@ impl ImageFileName { } /// -/// Represents the filename of an ImageLayer +/// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text /// -__ /// ``` -impl ImageFileName { - /// - /// Parse a string as an image file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl ImageLayerName { + /// Parse a string as then LayerName part of an image layer file name. Returns None if the + /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -202,14 +198,14 @@ impl ImageFileName { let lsn = Lsn::from_hex(lsn_str).ok()?; - Some(ImageFileName { + Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } -impl fmt::Display for ImageFileName { +impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -220,21 +216,24 @@ impl fmt::Display for ImageFileName { ) } } + +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The +/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// over time (e.g. across shard splits or compression). The physical filenames of layers in local +/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers +/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) +/// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum LayerFileName { - Image(ImageFileName), - Delta(DeltaFileName), +pub enum LayerName { + Image(ImageLayerName), + Delta(DeltaLayerName), } -impl LayerFileName { - pub fn file_name(&self) -> String { - self.to_string() - } - +impl LayerName { /// Determines if this layer file is considered to be in future meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { - use LayerFileName::*; + use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, @@ -243,7 +242,7 @@ impl LayerFileName { } pub(crate) fn kind(&self) -> &'static str { - use LayerFileName::*; + use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", @@ -251,7 +250,7 @@ impl LayerFileName { } } -impl fmt::Display for LayerFileName { +impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), @@ -260,25 +259,25 @@ impl fmt::Display for LayerFileName { } } -impl From for LayerFileName { - fn from(fname: ImageFileName) -> Self { +impl From for LayerName { + fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } -impl From for LayerFileName { - fn from(fname: DeltaFileName) -> Self { +impl From for LayerName { + fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } -impl FromStr for LayerFileName { +impl FromStr for LayerName { type Err = String; /// Conversion from either a physical layer filename, or the string-ization of /// Self. When loading a physical layer filename, we drop any extra information /// not needed to build Self. fn from_str(value: &str) -> Result { - let gen_suffix_regex = Regex::new("^(?.+)-(?[0-9a-f]{8})$").unwrap(); + let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); let file_name: Cow = match gen_suffix_regex.captures(value) { Some(captures) => captures .name("base") @@ -288,8 +287,8 @@ impl FromStr for LayerFileName { None => value.into(), }; - let delta = DeltaFileName::parse_str(&file_name); - let image = ImageFileName::parse_str(&file_name); + let delta = DeltaLayerName::parse_str(&file_name); + let image = ImageLayerName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -304,7 +303,7 @@ impl FromStr for LayerFileName { } } -impl serde::Serialize for LayerFileName { +impl serde::Serialize for LayerName { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -316,19 +315,19 @@ impl serde::Serialize for LayerFileName { } } -impl<'de> serde::Deserialize<'de> for LayerFileName { +impl<'de> serde::Deserialize<'de> for LayerName { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - deserializer.deserialize_string(LayerFileNameVisitor) + deserializer.deserialize_string(LayerNameVisitor) } } -struct LayerFileNameVisitor; +struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { - type Value = LayerFileName; +impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { + type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!( @@ -349,16 +348,16 @@ mod test { use super::*; #[test] fn image_layer_parse() -> anyhow::Result<()> { - let expected = LayerFileName::Image(ImageFileName { + let expected = LayerName::Image(ImageLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn: Lsn::from_hex("00000000014FED58").unwrap(), }); - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected,); // Omitting generation suffix is valid - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected,); Ok(()) @@ -366,17 +365,17 @@ mod test { #[test] fn delta_layer_parse() -> anyhow::Result<()> { - let expected = LayerFileName::Delta(DeltaFileName { + let expected = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn_range: Lsn::from_hex("00000000014FED58").unwrap() ..Lsn::from_hex("000000000154C481").unwrap(), }); - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected); // Omitting generation suffix is valid - let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; assert_eq!(parsed, expected); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6d012c70c..7edb922069 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -60,7 +60,7 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::timeline::init::LocalLayerFileMetadata; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -75,7 +75,7 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }, }; @@ -1905,7 +1905,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1925,7 +1925,7 @@ impl Timeline { /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. pub(crate) async fn evict_layer( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, ) -> anyhow::Result> { let _gate = self .gate @@ -2387,13 +2387,13 @@ impl Timeline { index_part: Option, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; - use LayerFileName::*; + use LayerName::*; let mut guard = self.layers.write().await; let timer = self.metrics.load_layer_map_histo.start_timer(); - // Scan timeline directory and create ImageFileName and DeltaFilename + // Scan timeline directory and create ImageLayerName and DeltaFilename // structs representing all files on disk let timeline_path = self .conf @@ -2463,33 +2463,35 @@ impl Timeline { let mut needs_cleanup = Vec::new(); let mut total_physical_size = 0; - for (name, local_path, decision) in decided { + for (name, decision) in decided { let decision = match decision { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain // the local file if the contents appear to match - if local.file_size() == remote.file_size() { + if local.metadata.file_size() == remote.file_size() { // Use the local file, but take the remote metadata so that we pick up // the correct generation. - UseLocal(remote) + UseLocal( + LocalLayerFileMetadata { + metadata: remote, + local_path: local.local_path + } + ) } else { - let local_path = local_path.as_ref().expect("Locally found layer must have path"); - init::cleanup_local_file_for_remote(local_path, &local, &remote)?; + init::cleanup_local_file_for_remote(&local, &remote)?; UseRemote { local, remote } } } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { - if local.is_some() { - let local_path = local_path.expect("Locally found layer must have path"); - init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?; + if let Some(local) = local { + init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - let local_path = local_path.expect("Locally found layer must have path"); - init::cleanup_local_only_file(&local_path, &name, &local)?; + init::cleanup_local_only_file(&name, &local)?; // this file never existed remotely, we will have to do rework continue; } @@ -2503,20 +2505,9 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(m) => { - total_physical_size += m.file_size(); - - let local_path = local_path.unwrap_or_else(|| { - local_layer_path( - conf, - &this.tenant_shard_id, - &this.timeline_id, - &name, - &m.generation, - ) - }); - - Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard() + UseLocal(local) => { + total_physical_size += local.metadata.file_size(); + Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. } => { Layer::for_evicted(conf, &this, name, remote) @@ -2997,10 +2988,10 @@ impl Timeline { } } - async fn find_layer(&self, layer_name: &LayerFileName) -> Option { + async fn find_layer(&self, layer_name: &LayerName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename(); + let historic_layer_name = historic_layer.layer_name(); if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } @@ -3030,7 +3021,7 @@ impl Timeline { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( - layer.layer_desc().filename(), + layer.layer_desc().layer_name(), (&layer.metadata()).into(), last_activity_ts, ) @@ -3177,7 +3168,7 @@ impl Timeline { if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, start_lsn); @@ -3206,7 +3197,7 @@ impl Timeline { for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); let frozen_layer = frozen_layer.clone(); @@ -4731,7 +4722,7 @@ impl Timeline { if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename(), + l.layer_name(), horizon_cutoff, ); result.layers_needed_by_cutoff += 1; @@ -4742,7 +4733,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename(), + l.layer_name(), pitr_cutoff, ); result.layers_needed_by_pitr += 1; @@ -4761,7 +4752,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), retain_lsn, l.is_incremental(), ); @@ -4792,7 +4783,7 @@ impl Timeline { if !layers .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { - debug!("keeping {} because it is the latest layer", l.filename()); + debug!("keeping {} because it is the latest layer", l.layer_name()); result.layers_not_updated += 1; continue 'outer; } @@ -4800,7 +4791,7 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), l.is_incremental(), ); layers_to_remove.push(l); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 5c2b25da56..2641bf3d13 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -407,7 +407,7 @@ async fn remote_copy( let owned = crate::tenant::storage_layer::Layer::for_evicted( adoptee.conf, adoptee, - adopted.layer_desc().filename(), + adopted.layer_desc().layer_name(), metadata, ); diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 9c33981807..66aa765015 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -6,7 +6,7 @@ use crate::{ self, index::{IndexPart, LayerFileMetadata}, }, - storage_layer::LayerFileName, + storage_layer::LayerName, Generation, }, METADATA_FILE_NAME, @@ -20,7 +20,7 @@ use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, Utf8PathBuf, u64), + Layer(LayerName, Utf8PathBuf, u64), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -43,7 +43,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); Discovered::Layer(file_name, direntry.path().to_owned(), file_size) @@ -72,6 +72,28 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + Self { + local_path, + metadata: LayerFileMetadata::new(file_size, generation, shard), + } + } +} + /// Decision on what to do with a layer file after considering its local and remote metadata. #[derive(Clone, Debug)] pub(super) enum Decision { @@ -80,11 +102,11 @@ pub(super) enum Decision { /// The layer is present locally, but local metadata does not match remote; we must /// delete it and treat it as evicted. UseRemote { - local: LayerFileMetadata, + local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, /// The layer is present locally, and metadata matches. - UseLocal(LayerFileMetadata), + UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -92,39 +114,29 @@ pub(super) enum Decision { pub(super) enum DismissedLayer { /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded. Future { - /// The local metadata. `None` if the layer is only known through [`IndexPart`]. - local: Option, + /// `None` if the layer is only known through [`IndexPart`]. + local: Option, }, /// The layer only exists locally. /// /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. - LocalOnly(LayerFileMetadata), + LocalOnly(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( - discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>, + discovered: Vec<(LayerName, Utf8PathBuf, u64)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, shard: ShardIndex, -) -> Vec<( - LayerFileName, - Option, - Result, -)> { +) -> Vec<(LayerName, Result)> { use Decision::*; - // name => (local_path, local_metadata, remote_metadata) - type Collected = HashMap< - LayerFileName, - ( - Option, - Option, - Option, - ), - >; + // name => (local_metadata, remote_metadata) + type Collected = + HashMap, Option)>; let mut discovered = discovered .into_iter() @@ -135,8 +147,9 @@ pub(super) fn reconcile( // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. ( - Some(local_path), - Some(LayerFileMetadata::new(file_size, generation, shard)), + Some(LocalLayerFileMetadata::new( + local_path, file_size, generation, shard, + )), None, ), ) @@ -152,20 +165,20 @@ pub(super) fn reconcile( .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) .for_each(|(name, metadata)| { if let Some(existing) = discovered.get_mut(name) { - existing.2 = Some(metadata); + existing.1 = Some(metadata); } else { - discovered.insert(name.to_owned(), (None, None, Some(metadata))); + discovered.insert(name.to_owned(), (None, Some(metadata))); } }); discovered .into_iter() - .map(|(name, (local_path, local, remote))| { + .map(|(name, (local, remote))| { let decision = if name.is_in_future(disk_consistent_lsn) { Err(DismissedLayer::Future { local }) } else { match (local, remote) { - (Some(local), Some(remote)) if local != remote => { + (Some(local), Some(remote)) if local.metadata != remote => { Ok(UseRemote { local, remote }) } (Some(x), Some(_)) => Ok(UseLocal(x)), @@ -177,7 +190,7 @@ pub(super) fn reconcile( } }; - (name, local_path, decision) + (name, decision) }) .collect::>() } @@ -189,12 +202,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { } pub(super) fn cleanup_local_file_for_remote( - path: &Utf8Path, - local: &LayerFileMetadata, + local: &LocalLayerFileMetadata, remote: &LayerFileMetadata, ) -> anyhow::Result<()> { - let local_size = local.file_size(); + let local_size = local.metadata.file_size(); let remote_size = remote.file_size(); + let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); @@ -211,7 +224,7 @@ pub(super) fn cleanup_local_file_for_remote( pub(super) fn cleanup_future_layer( path: &Utf8Path, - name: &LayerFileName, + name: &LayerName, disk_consistent_lsn: Lsn, ) -> anyhow::Result<()> { // future image layers are allowed to be produced always for not yet flushed to disk @@ -223,12 +236,14 @@ pub(super) fn cleanup_future_layer( } pub(super) fn cleanup_local_only_file( - path: &Utf8Path, - name: &LayerFileName, - local: &LayerFileMetadata, + name: &LayerName, + local: &LocalLayerFileMetadata, ) -> anyhow::Result<()> { let kind = name.kind(); - tracing::info!("found local-only {kind} layer {name}, metadata {local:?}"); - std::fs::remove_file(path)?; + tracing::info!( + "found local-only {kind} layer {name}, metadata {:?}", + local.metadata + ); + std::fs::remove_file(&local.local_path)?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 8e8d64e0c6..a72eb1b3bf 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -294,7 +294,7 @@ impl LayerFileManager { // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.filename())) + .with_context(|| format!("get layer from desc: {}", desc.layer_name())) .expect("not found") .clone() } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 0bf4d1e599..7797117e0f 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,4 +1,4 @@ -use super::storage_layer::LayerFileName; +use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; @@ -45,7 +45,7 @@ pub(crate) struct UploadQueueInitialized { /// All layer files stored in the remote storage, taking into account all /// in-progress and queued operations - pub(crate) latest_files: HashMap, + pub(crate) latest_files: HashMap, /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? @@ -89,7 +89,7 @@ pub(crate) struct UploadQueueInitialized { /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] - pub(crate) dangling_files: HashMap, + pub(crate) dangling_files: HashMap, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. pub(crate) shutting_down: bool, @@ -281,7 +281,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7c0f699958..68133fc0a9 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -110,7 +110,7 @@ pub(crate) fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, )) } @@ -121,7 +121,7 @@ pub(crate) fn branch_cleanup_and_check_errors( // layer we think is missing. result.errors.push(format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer.file_name(), + layer, metadata.generation.get_suffix(), metadata.shard )) @@ -170,8 +170,7 @@ pub(crate) struct LayerRef { /// the tenant to query whether an object exists. #[derive(Default)] pub(crate) struct TenantObjectListing { - shard_timelines: - HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, + shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>, } impl TenantObjectListing { @@ -180,7 +179,7 @@ impl TenantObjectListing { pub(crate) fn push( &mut self, ttid: TenantShardTimelineId, - layers: HashSet<(LayerFileName, Generation)>, + layers: HashSet<(LayerName, Generation)>, ) { let shard_index = ShardIndex::new( ttid.tenant_shard_id.shard_number, @@ -208,7 +207,7 @@ impl TenantObjectListing { pub(crate) fn check_ref( &mut self, timeline_id: TimelineId, - layer_file: &LayerFileName, + layer_file: &LayerName, metadata: &IndexLayerMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { @@ -224,7 +223,7 @@ impl TenantObjectListing { true } - pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> { let mut result = Vec::new(); for ((shard_index, timeline_id), layers) in &self.shard_timelines { for ((layer_file, generation), layer_ref) in layers { @@ -249,23 +248,23 @@ pub(crate) enum BlobDataParseResult { Parsed { index_part: IndexPart, index_part_generation: Generation, - s3_layers: HashSet<(LayerFileName, Generation)>, + s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> { +fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? Some((layer_filename, gen)) if gen.len() == 8 => { - let layer = layer_filename.parse::()?; + let layer = layer_filename.parse::()?; let gen = Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; Ok((layer, gen)) } - _ => Ok((name.parse::()?, Generation::none())), + _ => Ok((name.parse::()?, Generation::none())), } } diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index 4eccad381b..2c93a8490a 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use utils::generation::Generation; @@ -48,16 +48,16 @@ impl SnapshotDownloader { async fn download_layer( &self, ttid: TenantShardTimelineId, - layer_name: LayerFileName, + layer_name: LayerName, layer_metadata: IndexLayerMetadata, - ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> { // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use // different layer names (remote-style has the generation suffix) let local_path = self.output_path.join(format!( "{}/timelines/{}/{}{}", ttid.tenant_shard_id, ttid.timeline_id, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() )); @@ -76,7 +76,7 @@ impl SnapshotDownloader { let remote_layer_path = format!( "{}{}{}", timeline_root.prefix_in_bucket, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() ); @@ -110,7 +110,7 @@ impl SnapshotDownloader { async fn download_layers( &self, ttid: TenantShardTimelineId, - layers: Vec<(LayerFileName, IndexLayerMetadata)>, + layers: Vec<(LayerName, IndexLayerMetadata)>, ) -> anyhow::Result<()> { let layer_count = layers.len(); tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); @@ -138,7 +138,7 @@ impl SnapshotDownloader { tracing::info!( "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", layer_metadata.file_size, - layer_name.file_name() + layer_name ); } Err(e) => { @@ -163,7 +163,7 @@ impl SnapshotDownloader { index_part_generation: Generation, ancestor_layers: &mut HashMap< TenantShardTimelineId, - HashMap, + HashMap, >, ) -> anyhow::Result<()> { let index_bytes = serde_json::to_string(&index_part).unwrap(); @@ -234,7 +234,7 @@ impl SnapshotDownloader { // happen if this tenant has been split at some point) let mut ancestor_layers: HashMap< TenantShardTimelineId, - HashMap, + HashMap, > = Default::default(); for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f618c508bc..390b94c2ea 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import ( DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name +from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -2664,7 +2664,7 @@ class NeonPageserver(PgProtocol, LogUtils): ) def layer_exists( - self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName ) -> bool: layers = self.list_layers(tenant_id, timeline_id) return layer_name in [parse_layer_file_name(p.name) for p in layers] diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index fd018cb778..1fb618f445 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -12,7 +12,7 @@ class IndexLayerMetadata: @dataclass(frozen=True) -class ImageLayerFileName: +class ImageLayerName: lsn: Lsn key_start: Key key_end: Key @@ -26,7 +26,7 @@ class ImageLayerFileName: @dataclass(frozen=True) -class DeltaLayerFileName: +class DeltaLayerName: lsn_start: Lsn lsn_end: Lsn key_start: Key @@ -41,14 +41,16 @@ class DeltaLayerFileName: return ret -LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName] +LayerName = Union[ImageLayerName, DeltaLayerName] class InvalidFileName(Exception): pass -IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$") +IMAGE_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) def parse_image_layer(f_name: str) -> Tuple[int, int, int]: @@ -62,7 +64,7 @@ def parse_image_layer(f_name: str) -> Tuple[int, int, int]: DELTA_LAYER_FILE_NAME = re.compile( - "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$" + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" ) @@ -80,16 +82,16 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: ) -def parse_layer_file_name(file_name: str) -> LayerFileName: +def parse_layer_file_name(file_name: str) -> LayerName: try: key_start, key_end, lsn = parse_image_layer(file_name) - return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) + return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) except InvalidFileName: pass try: key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name) - return DeltaLayerFileName( + return DeltaLayerName( lsn_start=Lsn(lsn_start), lsn_end=Lsn(lsn_end), key_start=Key(key_start), @@ -101,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerFileName: raise InvalidFileName("neither image nor delta layer") -def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): +def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): """ Determines if this layer file is considered to be in future meaning we will discard these layers during timeline initialization from the given disk_consistent_lsn. """ - if ( - isinstance(layer_file_name, ImageLayerFileName) - and layer_file_name.lsn > disk_consistent_lsn - ): + if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn: return True elif ( - isinstance(layer_file_name, DeltaLayerFileName) + isinstance(layer_file_name, DeltaLayerName) and layer_file_name.lsn_end > disk_consistent_lsn + 1 ): return True @@ -122,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): @dataclass class IndexPartDump: - layer_metadata: Dict[LayerFileName, IndexLayerMetadata] + layer_metadata: Dict[LayerName, IndexLayerMetadata] disk_consistent_lsn: Lsn @classmethod diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index f311a8bf2c..cc34fd83e9 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -3,8 +3,8 @@ import time from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver from fixtures.pageserver.types import ( - DeltaLayerFileName, - ImageLayerFileName, + DeltaLayerName, + ImageLayerName, is_future_layer, ) from fixtures.pageserver.utils import ( @@ -81,7 +81,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): current = get_index_part() assert len(set(current.layer_metadata.keys())) == 1 layer_file_name = list(current.layer_metadata.keys())[0] - assert isinstance(layer_file_name, DeltaLayerFileName) + assert isinstance(layer_file_name, DeltaLayerName) assert layer_file_name.is_l0(), f"{layer_file_name}" log.info("force image layer creation in the future by writing some data into in-memory layer") @@ -146,7 +146,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): future_layers = get_future_layers() assert len(future_layers) == 1 future_layer = future_layers[0] - assert isinstance(future_layer, ImageLayerFileName) + assert isinstance(future_layer, ImageLayerName) assert future_layer.lsn == last_record_lsn log.info( f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index adcf7de8d4..58eaf404d3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -729,8 +729,8 @@ def test_upgrade_generationless_local_file_paths( for filename in os.listdir(timeline_dir): path = os.path.join(timeline_dir, filename) log.info(f"Found file {path}") - if path.endswith("-00000001"): - new_path = path[:-9] + if path.endswith("-v1-00000001"): + new_path = path[:-12] os.rename(path, new_path) log.info(f"Renamed {path} -> {new_path}") files_renamed += 1 From b06eec41fa5971899fd15ed4b643889863c616c7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 8 May 2024 20:58:35 +0300 Subject: [PATCH 009/130] Ignore page header when comparing VM pages in test_vm_bits.py (#7499) ## Problem See #6714, #6967 ## Summary of changes Completely ignore page header when comparing VM pages. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_vm_bits.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 06f2a8befd..b549db1af6 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -168,15 +168,16 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # The VM page in shared buffer cache, and the same page as reconstructed # by the pageserver, should be equal. # - # Ignore the LSN on the page though (first 8 bytes). If the dirty - # VM page is flushed from the cache for some reason, it gets WAL-logged, - # which changes the LSN on the page. + # Ignore page header (24 bytes) of visibility map. + # If the dirty VM page is flushed from the cache for some reason, + # it gets WAL-logged, which changes the LSN on the page. + # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page. cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex() cur.execute( "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" ) - vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex() assert vm_page_at_pageserver == vm_page_in_cache From d5399b729b3ecd3d9d38d8e61d3511fc4bf321b5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 8 May 2024 19:29:16 +0100 Subject: [PATCH 010/130] pageserver: fix division by zero in layer counting metric (#7662) For aux file keys (v1 or v2) the vectored read path does not return an error when they're missing. Instead they are omitted from the resulting btree (this is a requirement, not a bug). Skip updating the metric in these cases to avoid infinite results. --- pageserver/src/tenant/timeline.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7edb922069..5983529a44 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1220,11 +1220,17 @@ impl Timeline { } reconstruct_timer.stop_and_record(); - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + // For aux file keys (v1 or v2) the vectored read path does not return an error + // when they're missing. Instead they are omitted from the resulting btree + // (this is a requirement, not a bug). Skip updating the metric in these cases + // to avoid infinite results. + if !results.is_empty() { + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + } Ok(results) } From ab10523cc1d59cd65d88181645b149d4adc23c5e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 9 May 2024 10:58:38 +0200 Subject: [PATCH 011/130] remote_storage: AWS_PROFILE with endpoint overrides in ~/.aws/config (updates AWS SDKs) (#7664) Before this PR, using the AWS SDK profile feature for running against minio didn't work because * our SDK versions were too old and didn't include https://github.com/awslabs/aws-sdk-rust/issues/1060 and * we didn't massage the s3 client config builder correctly. This PR * udpates all the AWS SDKs we use to, respectively, the latest version I could find on crates.io (Is there a better process?) * changes the way remote_storage constructs the S3 client, and * documents how to run the test suite against real S3 & local minio. Regarding the changes to `remote_storage`: if one reads the SDK docs, it is clear that the recommended way is to use `aws_config::from_env`, then customize. What we were doing instead is to use the `aws_sdk_s3` builder directly. To get the `local-minio` in the added docs working, I needed to update both the SDKs and make the changes to the `remote_storage`. See the commit history in this PR for details. Refs: * byproduct: https://github.com/smithy-lang/smithy-rs/pull/3633 * follow-up on deprecation: https://github.com/neondatabase/neon/issues/7665 * follow-up for scrubber S3 setup: https://github.com/neondatabase/neon/issues/7667 --- Cargo.lock | 88 +++++++++------ Cargo.toml | 14 +-- libs/remote_storage/src/s3_bucket.rs | 92 +++++++++------ s3_scrubber/src/lib.rs | 5 +- test_runner/README.md | 160 +++++++++++++++++++++++++++ workspace_hack/Cargo.toml | 2 +- 6 files changed, 285 insertions(+), 76 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9bff5e1eff..6ce7180d67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.9" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.1.4" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82" +checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -309,14 +309,15 @@ dependencies = [ "time", "tokio", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -326,9 +327,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8" +checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -373,10 +374,11 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.14.0" +version = "1.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113" +checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" dependencies = [ + "ahash", "aws-credential-types", "aws-runtime", "aws-sigv4", @@ -391,20 +393,25 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", + "fastrand 2.0.0", + "hex", + "hmac", "http 0.2.9", "http-body 0.4.5", + "lru", "once_cell", "percent-encoding", "regex-lite", + "sha2", "tracing", "url", ] [[package]] name = "aws-sdk-sso" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b" +checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" dependencies = [ "aws-credential-types", "aws-runtime", @@ -424,9 +431,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841" +checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" dependencies = [ "aws-credential-types", "aws-runtime", @@ -446,9 +453,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce" +checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -469,9 +476,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" +checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -498,9 +505,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -509,9 +516,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b" +checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -541,9 +548,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" +checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -581,9 +588,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.1.8" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01" +checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -594,6 +601,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", + "http-body 1.0.0", "hyper 0.14.26", "hyper-rustls 0.24.0", "once_cell", @@ -606,9 +614,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.2.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5" +checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -623,16 +631,19 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" +checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", "http 0.2.9", + "http 1.1.0", "http-body 0.4.5", + "http-body 1.0.0", + "http-body-util", "itoa", "num-integer", "pin-project-lite", @@ -646,18 +657,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40" +checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -2935,6 +2946,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +dependencies = [ + "hashbrown 0.14.0", +] + [[package]] name = "match_cfg" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 1ddadd2f3c..17f30a1327 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,14 +52,14 @@ azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.14" +aws-config = { version = "1.3", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.26" aws-sdk-iam = "1.15.0" -aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.1.4" -aws-credential-types = "1.1.4" -aws-sigv4 = { version = "1.2.0", features = ["sign-http"] } -aws-types = "1.1.7" +aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.9" +aws-credential-types = "1.2.0" +aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } +aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c0b89cee2a..c3d6c75e20 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -27,7 +27,7 @@ use aws_config::{ }; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, + config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, @@ -75,13 +75,13 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", - aws_config.bucket_name + remote_storage_config.bucket_name ); - let region = Some(Region::new(aws_config.bucket_region.clone())); + let region = Some(Region::new(remote_storage_config.bucket_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region.clone()); @@ -113,6 +113,38 @@ impl S3Bucket { // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) + .region(region) + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + + let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { + s.spawn(|| { + // TODO: make this function async. + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(sdk_config_loader.load()) + }) + .join() + .unwrap() + }); + + let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); + + // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. + // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) + if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { + s3_config_builder = s3_config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); + } + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. @@ -120,42 +152,36 @@ impl S3Bucket { retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); + s3_config_builder = s3_config_builder.retry_config(retry_config.build()); - let mut config_builder = Builder::default() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(region) - .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) - .retry_config(retry_config.build()) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + let s3_config = s3_config_builder.build(); + let client = aws_sdk_s3::Client::from_conf(s3_config); - if let Some(custom_endpoint) = aws_config.endpoint.clone() { - config_builder = config_builder - .endpoint_url(custom_endpoint) - .force_path_style(true); - } + let prefix_in_bucket = remote_storage_config + .prefix_in_bucket + .as_deref() + .map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] + } - let client = Client::from_conf(config_builder.build()); + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix = &prefix[1..] - } - - let mut prefix = prefix.to_string(); - while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix.pop(); - } - prefix - }); Ok(Self { client, - bucket_name: aws_config.bucket_name.clone(), - max_keys_per_list_response: aws_config.max_keys_per_list_response, + bucket_name: remote_storage_config.bucket_name.clone(), + max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), - upload_storage_class: aws_config.upload_storage_class.clone(), + concurrency_limiter: ConcurrencyLimiter::new( + remote_storage_config.concurrency_limit.get(), + ), + upload_storage_class: remote_storage_config.upload_storage_class.clone(), timeout, }) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e976e66748..7966fb6a88 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -312,7 +312,10 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let sleep_impl: Arc = Arc::new(TokioSleep::new()); let mut builder = Config::builder() - .behavior_version(BehaviorVersion::v2023_11_09()) + .behavior_version( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) .region(bucket_region) .retry_config(RetryConfig::adaptive().with_max_attempts(3)) .sleep_impl(SharedAsyncSleep::from(sleep_impl)) diff --git a/test_runner/README.md b/test_runner/README.md index 051897744a..fd68cfff79 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -92,6 +92,166 @@ Exit after the first test failure: `./scripts/pytest -x ...` (there are many more pytest options; run `pytest -h` to see them.) +#### Running Python tests against real S3 or S3-compatible services + +Neon's `libs/remote_storage` supports multiple implementations of remote storage. +At the time of writing, that is +```rust +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(Utf8PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} +``` + +The test suite has a Python enum with equal name but different meaning: + +```python +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" +``` + +* `LOCAL_FS` => `LocalFs` +* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3` +* `REAL_S3` => configure `AwsS3` as detailed below + +When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`. +That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`: +* If it is not set, use `MOCK_S3` +* If it is set, use `REAL_S3`. + +For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars: + +```rust +pub struct S3Config { + // test suite env var: REMOTE_STORAGE_S3_BUCKET + pub bucket_name: String, + // test suite env var: REMOTE_STORAGE_S3_REGION + pub bucket_region: String, + // test suite determines this + pub prefix_in_bucket: Option, + // no env var exists; test suite sets it for MOCK_S3, because that's how moto works + pub endpoint: Option, + ... +} +``` + +*Credentials* are not part of the config, but discovered by the AWS SDK. +See the `libs/remote_storage` Rust code. +We're documenting two mechanism here: + +The test suite supports two mechanisms (`remote_storage.py`): + +**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. +Populate the env vars with AWS access keys that you created in IAM. +Our CI uses this mechanism. +However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)). +Instead, use profiles (next section). + +**Credential mechanism 2**: env var `AWS_PROFILE`. +This uses the AWS SDK's (and CLI's) profile mechanism. +Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html). +After configuring a profile (e.g. via the aws CLI), set the env var to its name. + +In conclusion, the full command line is: + +```bash +# with long-term AWS access keys +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_ACCESS_KEY_ID=... \ +AWS_SECRET_ACCESS_KEY=... \ +./scripts/pytest +``` + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_PROFILE=... \ +./scripts/pytest +``` + +If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first. + +##### Minio + +If you want to run test without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html). + +```bash +# Start in Terminal 1 +mkdir /tmp/minio_data +minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000 +``` + +In another terminal, create an `aws` CLI profile for it: + +```ini +# append to ~/.aws/config +[profile local-minio] +services = local-minio-services +[services local-minio-services] +s3 = + endpoint_url=http://127.0.0.1:9000/ +``` + + +Now configure the credentials (this is going to write `~/.aws/credentials` for you). +It's an interactive prompt. + +```bash +# Terminal 2 +$ aws --profile local-minio configure +AWS Access Key ID [None]: minioadmin +AWS Secret Access Key [None]: minioadmin +Default region name [None]: +Default output format [None]: +``` + +Now create a bucket `testbucket` using the CLI. + +```bash +# (don't forget to have AWS_PROFILE env var set; or use --profile) +aws --profile local-minio s3 mb s3://mybucket +``` + +(If it doesn't work, make sure you update your AWS CLI to a recent version. + The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html) + that we're using is quite new.) + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=doesntmatterforminio \ +AWS_PROFILE=local-minio \ +./scripts/pytest +``` + +NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable. +Just like the AWS SDKs, the `aws` CLI is sensible to it. + +#### Running Rust tests against real S3 or S3-compatible services + +We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397). + +They use the same env vars as the Python test suite (see previous section) +but interpret them on their own. +However, at this time, the interpretation is identical. + +So, above instructions apply to the Rust test as well. + ### Writing a test Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b2da33e44a..b605757f64 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,7 @@ aws-runtime = { version = "1", default-features = false, features = ["event-stre aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } From 39c712f2ca216a1d1556d4c0f8a846919418e661 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 May 2024 10:07:59 +0100 Subject: [PATCH 012/130] tests: adjust log allow list since reqwest upgrade (#7666) ## Problem Various performance test cases were destabilized by the recent upgrade of `reqwest`, because it changes an error string. Examples: - https://neon-github-public-dev.s3.amazonaws.com/reports/main/9005532594/index.html#testresult/3f984e471a9029a5/ - https://neon-github-public-dev.s3.amazonaws.com/reports/main/9005532594/index.html#testresult/8bd0f095fe0402b7/ The performance tests suffer from this more than most tests, because they churn enough data that the pageserver is still trying to contact the storage controller while it is shut down at the end of tests. ## Summary of changes s/Connection refused/error sending request/ --- test_runner/fixtures/pageserver/allowed_errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index e560844944..58a76d7586 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -88,7 +88,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Flushed oversized open layer with size.*", # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. - ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*", + ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", ) From 107f53529409533fec5e1ca39abf9acde8161862 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 May 2024 12:33:09 +0100 Subject: [PATCH 013/130] storage controller: fix handing of tenants with no timelines during scheduling optimization (#7673) ## Problem Storage controller was using a zero layer count in SecondaryProgress as a proxy for "not initialized". However, in tenants with zero timelines (a legitimate state), the layer count remains zero forever. This caused https://github.com/neondatabase/neon/pull/7583 to destabilize the storage controller scale test, which creates lots of tenants, some of which don't get any timelines. ## Summary of changes - Use a None mtime instead of zero layer count to determine if a SecondaryProgress should be ignored. - Adjust the test to use a shorter heatmap upload period to let it proceed faster while waiting for scheduling optimizations to complete. --- storage_controller/src/service.rs | 2 +- test_runner/performance/test_storage_controller_scale.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d3a53066c9..ae7e8d3d7d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4745,7 +4745,7 @@ impl Service { // them in an optimization const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; - if progress.bytes_total == 0 + if progress.heatmap_mtime.is_none() || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD && progress.bytes_downloaded != progress.bytes_total || progress.bytes_total - progress.bytes_downloaded diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 17dc96dabe..632d465c3f 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -102,6 +102,9 @@ def test_storage_controller_many_tenants( tenant_id, shard_count, stripe_size, + # Upload heatmaps fast, so that secondary downloads happen promptly, enabling + # the controller's optimization migrations to proceed promptly. + tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) futs.append(f) From 41fb838799ca2b0e3c20c440d49151b7153d9ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 9 May 2024 16:01:16 +0200 Subject: [PATCH 014/130] Fix tiered compaction k-merge bug and use in-memory alternative (#7661) This PR does two things: First, it fixes a bug with tiered compaction's k-merge implementation. It ignored the lsn of a key during ordering, so multiple updates of the same key could be read in arbitrary order, say from different layers. For example there is layers `[(a, 2),(b, 3)]` and `[(a, 1),(c, 2)]` in the heap, they might return `(a,2)` and `(a,1)`. Ultimately, this change wasn't enough to fix the ordering issues in #7296, in other words there is likely still bugs in the k-merge. So as the second thing, we switch away from the k-merge to an in-memory based one, similar to #4839, but leave the code around to be improved and maybe switched to later on. Part of #7296 --- pageserver/compaction/src/compact_tiered.rs | 9 +++++-- pageserver/compaction/src/helpers.rs | 30 ++++++++++++++++++--- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 137b93055a..12882c9d59 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -24,7 +24,9 @@ use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with}; +use crate::helpers::{ + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, +}; use crate::interface::*; use utils::lsn::Lsn; @@ -535,7 +537,10 @@ where } } // Open stream - let key_value_stream = std::pin::pin!(merge_delta_keys::(deltas.as_slice(), ctx)); + let key_value_stream = + std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + .await? + .map(Result::<_, anyhow::Error>::Ok)); let mut new_jobs = Vec::new(); // Slide a window through the keyspace diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index eb0e5ee82a..06454ee1d0 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -14,6 +14,7 @@ use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; +use utils::lsn::Lsn; pub fn keyspace_total_size( keyspace: &CompactionKeySpace, @@ -109,17 +110,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( } } +pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> anyhow::Result>::DeltaEntry<'a>>> +{ + let mut keys = Vec::new(); + for l in layers { + // Boxing and casting to LoadFuture is required to obtain the right Sync bound. + // If we do l.load_keys(ctx).await? directly, there is a compilation error. + let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); + keys.extend(load_future.await?.into_iter()); + } + keys.sort_by_key(|k| (k.key(), k.lsn())); + let stream = futures::stream::iter(keys.into_iter()); + Ok(stream) +} + enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { - fn key(&self) -> E::Key { + fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), Self::Unloaded(dl) => dl.key_range().start, } } + fn min_lsn(&self) -> Lsn { + match self { + Self::Loaded(entries) => entries.front().unwrap().lsn(), + Self::Unloaded(dl) => dl.lsn_range().start, + } + } } impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { fn partial_cmp(&self, other: &Self) -> Option { @@ -129,12 +153,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap - other.key().cmp(&self.key()) + (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { fn eq(&self, other: &Self) -> bool { - self.key().eq(&other.key()) + self.cmp(other) == std::cmp::Ordering::Equal } } impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} From 2682e0254ffb82f2e1eef0ec875346742b6e8b4e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 9 May 2024 11:15:19 -0400 Subject: [PATCH 015/130] Revert "chore(neon_test_utils): restrict installation to superuser" (#7679) This reverts commit 1173ee6a7e1168e671a6847eb94807b45c703490. ## Problem It breaks autoscaling tests --- pgxn/neon_test_utils/neon_test_utils.control | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 8c5b9b5dfe..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -3,5 +3,4 @@ comment = 'helpers for neon testing and debugging' default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true -trusted = false -superuser = true +trusted = true From 5ea117cddfe3bc58c500f0eff8352af796b58268 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 17:55:57 +0000 Subject: [PATCH 016/130] build(deps): bump Npgsql from 8.0.2 to 8.0.3 in /test_runner/pg_clients/csharp/npgsql (#7680) --- test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index 50243e3ea7..edf2a01337 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -8,7 +8,7 @@ - + From b9fd8dcf13e13b804047fc21089d2ecb509a1548 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 9 May 2024 15:52:56 -0400 Subject: [PATCH 017/130] fix(test): update the config for neon_binpath in from_repo_dir (#7684) ## Problem https://github.com/neondatabase/neon/pull/7637 breaks forward compat test. On commit ea531d448eb65c4f58abb9ef7d8cd461952f7c5f. https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988324349/index.html ``` test_create_snapshot 2024-05-07T16:03:11.331883Z INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:03:11.316131763 UTC build_tag: build_tag-env:5159 test_forward_compatibility 2024-05-07T16:07:02.310769Z INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:07:02.294676183 UTC build_tag: build_tag-env:5159 ``` The forward compatibility test is actually using the same tag as the current build. The commit before that, https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988126011/index.html ``` test_create_snapshot 2024-05-07T15:47:21.900796Z INFO version: git-env:2dbd1c1ed5cd0458933e8ffd40a9c0a5f4d610b8 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:47:21.882784185 UTC build_tag: build_tag-env:5158 test_forward_compatibility 2024-05-07T15:50:48.828733Z INFO version: git-env:c4d7d5982553d2cf66634d1fbf85d95ef44a6524 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:50:48.816635176 UTC build_tag: build_tag-env:release-5434 ``` This pull request patches the bin path so that the new neon_local will use the old binary. --------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 390b94c2ea..da379693a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -701,6 +701,11 @@ class NeonEnvBuilder: config["default_tenant_id"] = snapshot_config["default_tenant_id"] config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + # Update the config with new neon + postgres path in case of compat test + # FIXME: overriding pg_distrib_dir cause storage controller fail to start + # config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["neon_distrib_dir"] = str(self.neon_binpath) + with (self.repo_dir / "config").open("w") as f: toml.dump(config, f) From be1a88e574379ef29005e5a8760105509046584a Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Fri, 10 May 2024 12:17:00 +0200 Subject: [PATCH 018/130] Proxy added per ep rate limiter (#7636) ## Problem There is no global per-ep rate limiter in proxy. ## Summary of changes * Return global per-ep rate limiter back. * Rename weak compute rate limiter (the cli flags were not used anywhere, so it's safe to rename). --- proxy/src/auth/backend.rs | 73 ++++++++++++++++++++++++------ proxy/src/bin/proxy.rs | 26 ++++++++--- proxy/src/console/provider/neon.rs | 8 ++-- proxy/src/proxy.rs | 7 ++- proxy/src/rate_limiter/limiter.rs | 17 ++++--- proxy/src/serverless.rs | 19 ++++++-- proxy/src/serverless/backend.rs | 8 ++++ proxy/src/serverless/websocket.rs | 3 ++ 8 files changed, 126 insertions(+), 35 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3795e3b608..6a906b299b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::validate_password_and_exchange; +use crate::auth::{validate_password_and_exchange, AuthError}; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; @@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; -use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -280,6 +280,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. @@ -305,6 +306,10 @@ async fn auth_quirks( if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } + + if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => api.get_role_secret(ctx, &info).await?, @@ -417,6 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result> { use BackendType::*; @@ -428,8 +434,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { "performing authentication using the console" ); - let credentials = - auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; + let credentials = auth_quirks( + ctx, + &*api, + user_info, + client, + allow_cleartext, + config, + endpoint_rate_limiter, + ) + .await?; BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. @@ -539,7 +553,7 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::RateBucketInfo, + rate_limiter::{EndpointRateLimiter, RateBucketInfo}, scram::ServerSecret, stream::{PqStream, Stream}, }; @@ -699,10 +713,20 @@ mod tests { _ => panic!("wrong message"), } }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG) - .await - .unwrap(); + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + false, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -739,10 +763,20 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -780,9 +814,20 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); assert_eq!(creds.info.endpoint, "my-endpoint"); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 5399f13edd..be7d961b8c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -144,6 +144,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, /// Whether the auth rate limiter actually takes effect (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] auth_rate_limit_enabled: bool, @@ -154,7 +157,7 @@ struct ProxyCliArgs { #[clap(long, default_value_t = 64)] auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] redis_rps_limit: Vec, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] @@ -365,6 +368,10 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -373,6 +380,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); // TODO: rename the argument to something like serverless. @@ -387,6 +395,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); } @@ -559,11 +568,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); - let api = - console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + let api = console::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index ec66641d01..7728d2cafa 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -26,7 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -46,7 +46,7 @@ impl Api { endpoint, caches, locks, - endpoint_rate_limiter, + wake_compute_endpoint_rate_limiter, jwt, } } @@ -283,7 +283,7 @@ impl super::Api for Api { // check rate limit if !self - .endpoint_rate_limiter + .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize().into(), 1) { return Err(WakeComputeError::TooManyConnections); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index e4e095d77d..5824b70df9 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -19,6 +19,7 @@ use crate::{ metrics::{Metrics, NumClientConnectionsGuard}, protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, + rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, EndpointCacheKey, }; @@ -61,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -86,6 +88,7 @@ pub async fn task_main( let cancellation_handler = Arc::clone(&cancellation_handler); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await{ @@ -123,6 +126,7 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, + endpoint_rate_limiter2, conn_gauge, ) .instrument(span.clone()) @@ -234,6 +238,7 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, + endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( @@ -243,7 +248,6 @@ pub async fn handle_client( let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - // let _client_gauge = metrics.client_connections.guard(proto); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); @@ -286,6 +290,7 @@ pub async fn handle_client( &mut stream, mode.allow_cleartext(), &config.authentication_config, + endpoint_rate_limiter, ) .await { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5ba2c36436..b8c9490696 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -128,12 +128,18 @@ impl std::str::FromStr for RateBucketInfo { } impl RateBucketInfo { - pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + pub const DEFAULT_SET: [Self; 3] = [ Self::new(300, Duration::from_secs(1)), Self::new(200, Duration::from_secs(60)), Self::new(100, Duration::from_secs(600)), ]; + pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + Self::new(500, Duration::from_secs(1)), + Self::new(300, Duration::from_secs(60)), + Self::new(200, Duration::from_secs(600)), + ]; + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -266,7 +272,7 @@ mod tests { #[test] fn default_rate_buckets() { - let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET; + let mut defaults = RateBucketInfo::DEFAULT_SET; RateBucketInfo::validate(&mut defaults[..]).unwrap(); } @@ -333,11 +339,8 @@ mod tests { let rand = rand::rngs::StdRng::from_seed([1; 32]); let hasher = BuildHasherDefault::::default(); - let limiter = BucketRateLimiter::new_with_rand_and_hasher( - &RateBucketInfo::DEFAULT_ENDPOINT_SET, - rand, - hasher, - ); + let limiter = + BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher); for i in 0..1_000_000 { limiter.check(i, 1); } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index cbff51f207..f634ab4e98 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -36,6 +36,7 @@ use crate::context::RequestMonitoring; use crate::metrics::Metrics; use crate::protocol2::read_proxy_protocol; use crate::proxy::run_until_cancelled; +use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -54,6 +55,7 @@ pub async fn task_main( ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -82,6 +84,7 @@ pub async fn task_main( let backend = Arc::new(PoolingBackend { pool: Arc::clone(&conn_pool), config, + endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); let tls_config = match config.tls_config.as_ref() { @@ -129,6 +132,7 @@ pub async fn task_main( backend.clone(), connections.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), conn_token.clone(), server.clone(), tls_acceptor.clone(), @@ -162,6 +166,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, server: Builder, tls_acceptor: TlsAcceptor, @@ -245,6 +250,7 @@ async fn connection_handler( session_id, peer_addr, http_request_token, + endpoint_rate_limiter.clone(), ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), @@ -285,6 +291,7 @@ async fn request_handler( peer_addr: IpAddr, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> Result>, ApiError> { let host = request .headers() @@ -310,9 +317,15 @@ async fn request_handler( ws_connections.spawn( async move { - if let Err(e) = - websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) - .await + if let Err(e) = websocket::serve_websocket( + config, + ctx, + websocket, + cancellation_handler, + endpoint_rate_limiter, + host, + ) + .await { error!("error in websocket connection: {e:#}"); } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index ce58f575e2..6b79c12316 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -16,6 +16,7 @@ use crate::{ context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + rate_limiter::EndpointRateLimiter, Host, }; @@ -24,6 +25,7 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, pub config: &'static ProxyConfig, + pub endpoint_rate_limiter: Arc, } impl PoolingBackend { @@ -39,6 +41,12 @@ impl PoolingBackend { if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); } + if !self + .endpoint_rate_limiter + .check(conn_info.user_info.endpoint.clone().into(), 1) + { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => backend.get_role_secret(ctx).await?, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index b6cd85af73..649bec2c7c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -5,6 +5,7 @@ use crate::{ error::{io_error, ReportableError}, metrics::Metrics, proxy::{handle_client, ClientMode}, + rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -134,6 +135,7 @@ pub async fn serve_websocket( mut ctx: RequestMonitoring, websocket: HyperWebsocket, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; @@ -148,6 +150,7 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, + endpoint_rate_limiter, conn_gauge, ) .await; From 13d9589c35d444b444b6ed9ef4d8d7144ad232d0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 10 May 2024 12:01:39 +0100 Subject: [PATCH 019/130] pageserver: don't call get_vectored with empty keyspace (#7686) ## Problem This caused a variation of the stats bug fixed by https://github.com/neondatabase/neon/pull/7662. That PR also fixed this case, but we still shouldn't make redundant get calls. ## Summary of changes - Only call get in the create image layers loop at the end of a range if some keys have been accumulated --- pageserver/src/tenant/timeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5983529a44..60b3873b71 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4230,7 +4230,7 @@ impl Timeline { // Maybe flush `key_rest_accum` if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS - || last_key_in_range + || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) From 873b2220808e0bc059edb631186ed19b0238394d Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Fri, 10 May 2024 15:04:23 +0400 Subject: [PATCH 020/130] use own arm64 gha runners (#7373) ## Problem Move from aws based arm64 runners to bare-metal based ## Summary of changes Changes in GitHub action workflows where `runs-on: arm64` used. More parallelism added, build time for `neon with extra platform builds` workflow reduced from 45m to 25m --- .config/nextest.toml | 2 +- .github/actionlint.yml | 4 +-- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/neon_extra_builds.yml | 33 ++++++++++++------- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index a9398e4ab0..affdc16f31 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -1,2 +1,2 @@ [profile.default] -slow-timeout = { period = "20s", terminate-after = 3 } +slow-timeout = { period = "60s", terminate-after = 3 } diff --git a/.github/actionlint.yml b/.github/actionlint.yml index cb36e2eee6..942861ecd8 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,11 +1,9 @@ self-hosted-runner: labels: - arm64 - - dev - gen3 - large - # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged. - - macos-14 + - large-arm64 - small - us-east-2 config-variables: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index c527cef1ac..bdf00bcaae 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -39,7 +39,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }} + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5a2f9d6645..fdb03963fb 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -136,7 +136,7 @@ jobs: check-linux-arm-build: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, large-arm64 ] env: # Use release build only, to have less debug info around @@ -232,20 +232,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | - cargo nextest run $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -255,12 +255,12 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) check-codestyle-rust-arm: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, large-arm64 ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -269,6 +269,11 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: - name: Fix git ownership run: | @@ -305,31 +310,35 @@ jobs: exit 1 fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + if: matrix.build_type == 'debug' run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Run cargo clippy (release) + if: matrix.build_type == 'release' run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items + if: matrix.build_type == 'release' + run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) env: RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo fmt --all -- --check # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - name: Check rust dependencies - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo deny check gather-rust-build-stats: @@ -338,7 +347,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -369,7 +378,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings + run: cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats From 0b02043ba4e8d477b77a1f01bef9809c1f433ab4 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 10 May 2024 13:32:42 +0100 Subject: [PATCH 021/130] Fix permissions for safekeeper failpoints (#7669) We didn't check permission in `"/v1/failpoints"` endpoint, it means that everyone with per-tenant token could modify the failpoints. This commit fixes that. --- safekeeper/src/http/routes.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9ce26e6c5d..30d0081a47 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -519,6 +519,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { + check_permission(&r, None)?; let cancel = CancellationToken::new(); failpoints_handler(r, cancel).await }) From 86905c132205b5cbffb199fb002bc9c0484d4f43 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 17:15:11 +0300 Subject: [PATCH 022/130] openapi: resolve the synthetic_size duplication (#7651) We had accidentally left two endpoints for `tenant`: `/synthetic_size` and `/size`. Size had the more extensive description but has returned 404 since renaming. Remove the `/size` in favor of the working one and describe the `text/html` output. --- pageserver/src/http/openapi_spec.yml | 38 +++++----------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index c425f3e628..36c74ed140 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -420,25 +420,6 @@ paths: description: Tenant scheduled to load successfully /v1/tenant/{tenant_id}/synthetic_size: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - get: - description: | - Calculate tenant's synthetic size - responses: - "200": - description: Tenant's synthetic size - content: - application/json: - schema: - $ref: "#/components/schemas/SyntheticSizeResponse" - - # This route has no handler. TODO: remove? - /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id in: path @@ -468,19 +449,9 @@ paths: content: application/json: schema: - type: object - required: - - id - - size - properties: - id: - type: string - format: hex - size: - type: integer - nullable: true - description: | - Size metric in bytes or null if inputs_only=true was given. + $ref: "#/components/schemas/SyntheticSizeResponse" + text/html: + description: SVG representation of the tenant and it's timelines. "401": description: Unauthorized Error content: @@ -929,6 +900,9 @@ components: format: hex size: type: integer + nullable: true + description: | + Size metric in bytes or null if inputs_only=true was given. segment_sizes: type: array items: From d7f34bc3399c31f8e4c773cb5ae6f919e5d02d64 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 17:41:34 +0300 Subject: [PATCH 023/130] draw_timeline_dir: draw branch points and gc cutoff lines (#7657) in addition to layer names, expand the input vocabulary to recognize lines in the form of: ${kind}:${lsn} where: - kind in `gc_cutoff` or `branch` - lsn is accepted in Lsn display format (x/y) or hex (as used in layer names) gc_cutoff and branch have different colors. --- pageserver/ctl/src/draw_timeline_dir.rs | 89 +++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 9a556cb3d4..d8082f8ab4 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -28,6 +28,8 @@ //! # From an `index_part.json` in S3 //! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg //! +//! # enrich with lines for gc_cutoff and a child branch point +//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! //! ## Viewing @@ -48,7 +50,7 @@ //! ``` //! -use anyhow::Result; +use anyhow::{Context, Result}; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; @@ -90,6 +92,33 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } +#[derive(Clone, Copy)] +enum LineKind { + GcCutoff, + Branch, +} + +impl From for Fill { + fn from(value: LineKind) -> Self { + match value { + LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)), + LineKind::Branch => Fill::Color(rgb(0, 255, 0)), + } + } +} + +impl FromStr for LineKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::prelude::v1::Result { + Ok(match s { + "gc_cutoff" => LineKind::GcCutoff, + "branch" => LineKind::Branch, + _ => anyhow::bail!("unsupported linekind: {s}"), + }) + } +} + pub fn main() -> Result<()> { // Parse layer filenames from stdin struct Layer { @@ -99,8 +128,29 @@ pub fn main() -> Result<()> { } let mut files: Vec = vec![]; let stdin = io::stdin(); - for line in stdin.lock().lines() { + + let mut lines: Vec<(Lsn, LineKind)> = vec![]; + + for (lineno, line) in stdin.lock().lines().enumerate() { + let lineno = lineno + 1; + let line = line.unwrap(); + if let Some((kind, lsn)) = line.split_once(':') { + let (kind, lsn) = LineKind::from_str(kind) + .context("parse kind") + .and_then(|kind| { + if lsn.contains('/') { + Lsn::from_str(lsn) + } else { + Lsn::from_hex(lsn) + } + .map(|lsn| (kind, lsn)) + .context("parse lsn") + }) + .with_context(|| format!("parse {line:?} on {lineno}"))?; + lines.push((lsn, kind)); + continue; + } let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); @@ -117,8 +167,9 @@ pub fn main() -> Result<()> { } // Collect all coordinates - let mut keys: Vec = vec![]; - let mut lsns: Vec = vec![]; + let mut keys: Vec = Vec::with_capacity(files.len()); + let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); + for Layer { key_range: keyr, lsn_range: lsnr, @@ -131,6 +182,8 @@ pub fn main() -> Result<()> { lsns.push(lsnr.end); } + lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); + // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); @@ -144,10 +197,13 @@ pub fn main() -> Result<()> { println!( "{}", BeginSvg { - w: key_map.len() as f32, + w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); + + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas + for Layer { filename, key_range: keyr, @@ -169,7 +225,6 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas - let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -189,7 +244,7 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - key_start as f32 + stretch * xmargin, + 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) @@ -200,6 +255,26 @@ pub fn main() -> Result<()> { .comment(filename) ); } + + for (lsn, kind) in lines { + let lsn_start = *lsn_map.get(&lsn).unwrap(); + let lsn_end = lsn_start; + let stretch = 2.0; + let lsn_diff = 0.3; + let lsn_offset = -lsn_diff / 2.0; + let ymargin = 0.05; + println!( + "{}", + rectangle( + 0.0f32 + stretch * xmargin, + stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + (key_map.len() + 10) as f32, + stretch * (lsn_diff - 2.0 * ymargin) + ) + .fill(kind) + ); + } + println!("{}", EndSvg); eprintln!("num_images: {}", num_images); From 6206f76419416c6c936c97df5e660d28333ee835 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 17:46:50 +0300 Subject: [PATCH 024/130] build: run doctests (#7697) While switching to use nextest with the repository in f28bdb6, we had not noticed that it doesn't yet support running doctests. Run the doc tests before other tests. --- .github/workflows/build_and_test.yml | 3 +++ libs/utils/src/poison.rs | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index eada65505f..21e7a56670 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -341,6 +341,9 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES done diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 0bf5664f47..27378c69fc 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -3,7 +3,7 @@ //! # Example //! //! ``` -//! # tokio_test::block_on(async { +//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { //! use utils::poison::Poison; //! use std::time::Duration; //! From d7c68dc981db2d73cb5ff617472266b29bbc2ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 10 May 2024 17:50:47 +0200 Subject: [PATCH 025/130] Tiered compaction: fix early exit check in main loop (#7702) The old test based on the immutable `target_file_size` that was a parameter to the function. It makes no sense to go further once `current_level_target_height` has reached `u64::MAX`, as lsn's are u64 typed. In practice, we should only run into this if there is a bug, as the practical lsn range usually ends much earlier. Testing on `target_file_size` makes less sense, it basically implements an invocation mode that turns off the looping and only runs one iteration of it. @hlinnaka agrees that `current_level_target_height` is better here. Part of #7554 --- pageserver/compaction/src/compact_tiered.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 12882c9d59..20e9cf2196 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -106,7 +106,13 @@ pub async fn compact_tiered( ctx, ) .await?; - if target_file_size == u64::MAX { + if current_level_target_height == u64::MAX { + // our target height includes all possible lsns + info!( + level = current_level_no, + depth = depth, + "compaction loop reached max current_level_target_height" + ); break; } current_level_no += 1; From 95098c3216929506b66ce244a2420d07ab65e8dc Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 10 May 2024 17:20:14 +0100 Subject: [PATCH 026/130] Fix checkpoint metric (#7701) Split checkpoint_stats into two separate metrics: checkpoints_req and checkpoints_timed Fixes commit https://github.com/neondatabase/neon/commit/21e1a496a3f706097578de396a9107813c541001 --------- Co-authored-by: Peter Bendel --- vm-image-spec.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 56538630ac..e9d983eba3 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -278,15 +278,21 @@ files: ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) END AS replication_delay_seconds; - - metric_name: checkpoint_stats + - metric_name: checkpoints_req type: gauge - help: 'Number of requested and scheduled checkpoints' + help: 'Number of requested checkpoints' key_labels: - values: - - checkpoints_req - - checkpoints_timed + values: [checkpoints_req] query: | - SELECT checkpoints_req, checkpoints_timed FROM pg_stat_bgwriter; + SELECT checkpoints_req FROM pg_stat_bgwriter; + + - metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling From 6351313ae96ab6d0e3e2b27ed2d86eed3dd004c9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 May 2024 22:30:05 +0300 Subject: [PATCH 027/130] feat: allow detaching from ancestor for timelines without writes (#7639) The first implementation #7456 did not include `index_part.json` changes in an attempt to keep amount of changes down. Tracks the historic reparentings and earlier detach in `index_part.json`. - `index_part.json` receives a new field `lineage: Lineage` - `Lineage` is queried through RemoteTimelineClient during basebackup, creating `PREV LSN: none` for the invalid prev record lsn just as it would had been created for a newly created timeline - as `struct IndexPart` grew, it is now boxed in places Cc: #6994 --- pageserver/src/basebackup.rs | 2 +- pageserver/src/tenant/metadata.rs | 6 +- .../src/tenant/remote_timeline_client.rs | 28 +++- .../tenant/remote_timeline_client/index.rs | 140 +++++++++++++++++- pageserver/src/tenant/timeline.rs | 13 +- .../src/tenant/timeline/detach_ancestor.rs | 10 -- pageserver/src/tenant/upload_queue.rs | 8 +- s3_scrubber/src/checks.rs | 4 +- s3_scrubber/src/tenant_snapshot.rs | 2 +- .../regress/test_timeline_detach_ancestor.py | 83 +++++------ 10 files changed, 225 insertions(+), 71 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 58b18dae7d..dca1510810 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -601,7 +601,7 @@ where // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { + if self.timeline.is_ancestor_lsn(self.lsn) { write!(zenith_signal, "PREV LSN: none") .map_err(|e| BasebackupError::Server(e.into()))?; } else { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 8ba0775120..fc71ea7642 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -214,12 +214,12 @@ impl TimelineMetadata { self.body.ancestor_timeline = Some(*timeline); } - pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) { + pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { - assert_eq!(ancestor, *timeline); + assert_eq!(ancestor, branchpoint.0); } if self.body.ancestor_lsn != Lsn(0) { - assert_eq!(self.body.ancestor_lsn, *ancestor_lsn); + assert_eq!(self.body.ancestor_lsn, branchpoint.1); } self.body.ancestor_timeline = None; self.body.ancestor_lsn = Lsn(0); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bbe4e16378..9103760388 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -437,6 +437,19 @@ impl RemoteTimelineClient { } } + /// Returns true if this timeline was previously detached at this Lsn and the remote timeline + /// client is currently initialized. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + // technically this is a dirty read, but given how timeline detach ancestor is implemented + // via tenant restart, the lineage has always been uploaded. + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn)) + .unwrap_or(false) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -628,7 +641,7 @@ impl RemoteTimelineClient { ); let index_part = IndexPart::from(&*upload_queue); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); + let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ -647,7 +660,14 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; + let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else { + return Err(anyhow::anyhow!( + "cannot reparent without a current ancestor" + )); + }; + upload_queue.latest_metadata.reparent(new_parent); + upload_queue.latest_lineage.record_previous_ancestor(&prev); self.schedule_index_upload(upload_queue); @@ -670,9 +690,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue - .latest_metadata - .detach_from_ancestor(&adopted.0, &adopted.1); + upload_queue.latest_metadata.detach_from_ancestor(&adopted); + upload_queue.latest_lineage.record_detaching(&adopted); for layer in layers { upload_queue @@ -1811,6 +1830,7 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), + latest_lineage: initialized.latest_lineage.clone(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3e05905afa..b114d6aa10 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -6,6 +6,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; +use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; @@ -84,6 +85,9 @@ pub struct IndexPart { #[serde(rename = "metadata_bytes")] pub metadata: TimelineMetadata, + + #[serde(default)] + pub(crate) lineage: Lineage, } impl IndexPart { @@ -96,10 +100,11 @@ impl IndexPart { /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. - const LATEST_VERSION: usize = 4; + /// - 5: lineage was added + const LATEST_VERSION: usize = 5; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -107,6 +112,7 @@ impl IndexPart { layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, + lineage: Lineage, ) -> Self { let layer_metadata = layers_and_metadata .iter() @@ -119,6 +125,7 @@ impl IndexPart { disk_consistent_lsn, metadata, deleted_at: None, + lineage, } } @@ -147,6 +154,7 @@ impl IndexPart { &HashMap::new(), example_metadata.disk_consistent_lsn(), example_metadata, + Default::default(), ) } } @@ -155,8 +163,9 @@ impl From<&UploadQueueInitialized> for IndexPart { fn from(uq: &UploadQueueInitialized) -> Self { let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); let metadata = uq.latest_metadata.clone(); + let lineage = uq.latest_lineage.clone(); - Self::new(&uq.latest_files, disk_consistent_lsn, metadata) + Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage) } } @@ -184,8 +193,76 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata { } } +/// Limited history of earlier ancestors. +/// +/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly +/// reparented by having an later timeline be detached from it's ancestor. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] +pub(crate) struct Lineage { + /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`]. + #[serde(skip_serializing_if = "is_false", default)] + reparenting_history_truncated: bool, + + /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`] + /// + /// These are stored in case we want to support WAL based DR on the timeline. There can be many + /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings + /// after [`Self::original_ancestor`] has been set. + #[serde(skip_serializing_if = "Vec::is_empty", default)] + reparenting_history: Vec, + + /// The ancestor from which this timeline has been detached from and when. + /// + /// If you are adding support for detaching from a hierarchy, consider changing the ancestry + /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + #[serde(skip_serializing_if = "Option::is_none", default)] + original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, +} + +fn is_false(b: &bool) -> bool { + !b +} + +impl Lineage { + const REMEMBER_AT_MOST: usize = 100; + + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + if self.reparenting_history.last() == Some(old_ancestor) { + // do not re-record it + return; + } + + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; + + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + } + + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { + assert!(self.original_ancestor.is_none()); + + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + } + + /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed + /// to start a read/write primary at this lsn". + /// + /// Returns true if the Lsn was previously a branch point. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.original_ancestor + .as_ref() + .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn) + } +} + #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; #[test] @@ -221,6 +298,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -261,6 +339,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -302,7 +381,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -347,6 +427,7 @@ mod tests { ]) .unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -385,11 +466,58 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } + + #[test] + fn v5_indexpart_is_parsed() { + let example = r#"{ + "version":5, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, + "disk_consistent_lsn":"0/15A7618", + "metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + } + }"#; + + let expected = IndexPart { + version: 5, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata { + file_size: 23289856, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata { + file_size: 1015808, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }) + ]), + disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), + metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + fn parse_naive_datetime(s: &str) -> NaiveDateTime { + chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 60b3873b71..505dc8c30d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3037,6 +3037,18 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + + /// Returns true if the given lsn is or was an ancestor branchpoint. + pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { + // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original + // branchpoint in the value in IndexPart::lineage + self.ancestor_lsn == lsn + || (self.ancestor_lsn == Lsn::INVALID + && self + .remote_client + .as_ref() + .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn))) + } } type TraversalId = Arc; @@ -4354,7 +4366,6 @@ impl Timeline { /// - has an ancestor to detach from /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not /// a technical requirement - /// - has prev_lsn in remote storage (temporary restriction) /// /// After the operation has been started, it cannot be canceled. Upon restart it needs to be /// polled again until completion. diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 2641bf3d13..69b82344a6 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -22,8 +22,6 @@ pub(crate) enum Error { TooManyAncestors, #[error("shutting down, please retry later")] ShuttingDown, - #[error("detached timeline must receive writes before the operation")] - DetachedTimelineNeedsWrites, #[error("flushing failed")] FlushAncestor(#[source] anyhow::Error), #[error("layer download failed")] @@ -94,14 +92,6 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - if detached.get_prev_record_lsn() == Lsn::INVALID - || detached.disk_consistent_lsn.load() == ancestor_lsn - { - // this is to avoid a problem that after detaching we would be unable to start up the - // compute because of "PREV_LSN: invalid". - return Err(DetachedTimelineNeedsWrites); - } - // before we acquire the gate, we must mark the ancestor as having a detach operation // ongoing which will block other concurrent detach operations so we don't get to ackward // situations where there would be two branches trying to reparent earlier branches. diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 7797117e0f..a2f761fa94 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::index::Lineage; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; @@ -56,6 +57,9 @@ pub(crate) struct UploadQueueInitialized { /// DANGER: do not return to outside world, e.g., safekeepers. pub(crate) latest_metadata: TimelineMetadata, + /// Part of the flattened "next" `index_part.json`. + pub(crate) latest_lineage: Lineage, + /// `disk_consistent_lsn` from the last metadata file that was successfully /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. @@ -171,6 +175,7 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), + latest_lineage: Lineage::default(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations @@ -218,6 +223,7 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), + latest_lineage: index_part.lineage.clone(), projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), @@ -290,7 +296,7 @@ pub(crate) enum UploadOp { UploadLayer(ResidentLayer, LayerFileMetadata), /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), + UploadMetadata(Box, Lsn), /// Delete layer files Delete(Delete), diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 68133fc0a9..dd64a0a98f 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -246,7 +246,7 @@ pub(crate) struct S3TimelineBlobData { #[derive(Debug)] pub(crate) enum BlobDataParseResult { Parsed { - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, s3_layers: HashSet<(LayerName, Generation)>, }, @@ -368,7 +368,7 @@ pub(crate) async fn list_timeline_blobs( Ok(index_part) => { return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Parsed { - index_part, + index_part: Box::new(index_part), index_part_generation, s3_layers, }, diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index 2c93a8490a..a24a1e92ae 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -159,7 +159,7 @@ impl SnapshotDownloader { async fn download_timeline( &self, ttid: TenantShardTimelineId, - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, ancestor_layers: &mut HashMap< TenantShardTimelineId, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b8a88ca6df..075f0a6bbc 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,3 +1,4 @@ +import datetime import enum from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue @@ -12,6 +13,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import HistoricLayerInfo from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage from fixtures.types import Lsn, TimelineId @@ -56,15 +58,16 @@ SHUTDOWN_ALLOWED_ERRORS = [ @pytest.mark.parametrize("branchpoint", Branchpoint.all()) @pytest.mark.parametrize("restart_after", [True, False]) +@pytest.mark.parametrize("write_to_branch_first", [True, False]) def test_ancestor_detach_branched_from( - neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool + neon_env_builder: NeonEnvBuilder, + branchpoint: Branchpoint, + restart_after: bool, + write_to_branch_first: bool, ): """ Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. """ - # TODO: parametrize; currently unimplemented over at pageserver - write_to_branch_first = True - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -174,8 +177,7 @@ def test_ancestor_detach_branched_from( wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) -@pytest.mark.parametrize("restart_after", [True, False]) -def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool): +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): """ The case from RFC: @@ -204,9 +206,6 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res We confirm the end result by being able to delete "old main" after deleting "after". """ - # TODO: support not yet implemented for these - write_to_branch_first = True - env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -244,42 +243,57 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) - if write_to_branch_first: - with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: - assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192 - with ep.cursor() as cur: - cur.execute("UPDATE audit SET starts = starts + 1") - assert cur.rowcount == 1 - wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) - - client.timeline_checkpoint(env.initial_tenant, timeline_id) - all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == {reparented, same_branchpoint} - if restart_after: - env.pageserver.stop() - env.pageserver.start() - env.pageserver.quiesce_tenants() # checking the ancestor after is much faster than waiting for the endpoint not start expected_result = [ ("main", env.initial_timeline, None, 16384, 1), ("after", after, env.initial_timeline, 16384, 1), - ("new main", timeline_id, None, 8192, 2), + ("new main", timeline_id, None, 8192, 1), ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), ("reparented", reparented, timeline_id, 0, 1), ] - for _, timeline_id, expected_ancestor, _, _ in expected_result: - details = client.timeline_detail(env.initial_tenant, timeline_id) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for _, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) ancestor_timeline_id = details["ancestor_timeline_id"] if expected_ancestor is None: assert ancestor_timeline_id is None else: assert TimelineId(ancestor_timeline_id) == expected_ancestor + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == timeline_id: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. + when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when) + assert when_ts < datetime.datetime.now() + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == timeline_id: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + for name, _, _, rows, starts in expected_result: with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows @@ -293,14 +307,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) -@pytest.mark.parametrize("restart_after", [True, False]) -def test_detached_receives_flushes_while_being_detached( - neon_env_builder: NeonEnvBuilder, restart_after: bool -): +def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): """ Makes sure that the timeline is able to receive writes through-out the detach process. """ - write_to_branch_first = True env = neon_env_builder.init_start() @@ -330,12 +340,6 @@ def test_detached_receives_flushes_while_being_detached( ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant) assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows - if write_to_branch_first: - rows += insert_rows(256, ep) - wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) - client.timeline_checkpoint(env.initial_tenant, timeline_id) - log.info("completed {write_to_branch_first=}") - def small_txs(ep, queue: Queue[str], barrier): extra_rows = 0 @@ -368,11 +372,6 @@ def test_detached_receives_flushes_while_being_detached( reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert len(reparented) == 0 - if restart_after: - # ep and row production is kept alive on purpose - env.pageserver.stop() - env.pageserver.start() - env.pageserver.quiesce_tenants() queue.put("done") From 4270e86eb282367fedebe87a5a363619b466eece Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 10:58:03 +0300 Subject: [PATCH 028/130] test(ancestor detach): verify with fullbackup (#7706) In timeline detach ancestor tests there is no way to really be sure that there were no subtle off-by one bugs. One such bug is demoed and reverted. Add verifying fullbackup is equal before and after detaching ancestor. Fullbackup is expected to be equal apart from `zenith.signal`, which is known to be good because endpoint can be started without the detached branch receiving writes. --- .../regress/test_timeline_detach_ancestor.py | 89 ++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 075f0a6bbc..214e10c32e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,14 +1,19 @@ import datetime import enum +import tarfile +import time from concurrent.futures import ThreadPoolExecutor +from hashlib import sha256 +from pathlib import Path from queue import Empty, Queue from threading import Barrier -from typing import List +from typing import IO, List, Set, Tuple, Union import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PgBin, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo @@ -60,7 +65,10 @@ SHUTDOWN_ALLOWED_ERRORS = [ @pytest.mark.parametrize("restart_after", [True, False]) @pytest.mark.parametrize("write_to_branch_first", [True, False]) def test_ancestor_detach_branched_from( + test_output_dir, + pg_distrib_dir, neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, branchpoint: Branchpoint, restart_after: bool, write_to_branch_first: bool, @@ -70,6 +78,7 @@ def test_ancestor_detach_branched_from( """ env = neon_env_builder.init_start() + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) client = env.pageserver.http_client() @@ -146,6 +155,20 @@ def test_ancestor_detach_branched_from( else: branch_layers = set() + # run fullbackup to make sure there are no off by one errors + # take this on the parent + fullbackup_before = test_output_dir / "fullbackup-before.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {env.initial_timeline} {branch_at}", + "-o", + str(fullbackup_before), + ] + pg_bin.run_capture(cmd, env=psql_env) + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == set() @@ -173,9 +196,73 @@ def test_ancestor_detach_branched_from( # but if nothing was copied, then there is no nice rule. # there could be a hole in LSNs between copied from the "old main" and the first branch layer. + # take this on the detached, at same lsn + fullbackup_after = test_output_dir / "fullbackup-after.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {timeline_id} {branch_at}", + "-o", + str(fullbackup_after), + ] + pg_bin.run_capture(cmd, env=psql_env) + client.timeline_delete(env.initial_tenant, env.initial_timeline) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different + # as there is always "PREV_LSN: invalid" for "before" + skip_files = {"zenith.signal"} + + tar_cmp(fullbackup_before, fullbackup_after, skip_files) + + +def tar_cmp(left: Path, right: Path, skip_files: Set[str]): + """ + This is essentially: + + lines=$(comm -3 \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + | wc -l) + [ "$lines" = "0" ] + + But in a more mac friendly fashion. + """ + started_at = time.time() + + def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: + assert reader is not None + digest = sha256(usedforsecurity=False) + while True: + buf = reader.read(64 * 1024) + if not buf: + break + digest.update(buf) + return digest.digest() + + def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: + with tarfile.open(p) as f: + matching_files = (info for info in f if info.isreg() and info.name not in skip_files) + ret = list( + map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) + ) + ret.sort(key=lambda t: t[0]) + return ret + + left_list, right_list = map(build_hash_list, [left, right]) + + try: + assert len(left_list) == len(right_list) + + for left_tuple, right_tuple in zip(left_list, right_list): + assert left_tuple == right_tuple + finally: + elapsed = time.time() - started_at + log.info(f"tar_cmp completed in {elapsed}s") + def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): """ From 216fc5ba7beb48ac7ae45aa007f72f1a3c0f62e0 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 11:56:07 +0300 Subject: [PATCH 029/130] test: fix confusing limit and logging (#7589) The test has been flaky since 2024-04-11 for unknown reason, and the logging was off. Fix the logging and raise the limit a bit. The problematic ratio reproduces with pg14 and added sleep (not included) but not on pg15. The new ratio abs diff limit works for all inspected examples. Cc: #7536 --- test_runner/regress/test_disk_usage_eviction.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 5e9efa7cce..1d73e9cb18 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -623,15 +623,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): ratio = count_now / original_count abs_diff = abs(ratio - expected_ratio) assert original_count > count_now - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1" - ) + expectation = 0.06 + log.info( + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" + ) # in this test case both relative_spare and relative_equal produce # the same outcomes; this must be a quantization effect of similar # sizes (-s4 and -s6) and small (5MB) layer size. # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < 0.05 + assert abs_diff < expectation @pytest.mark.parametrize( From 1a1d52787579a617028c66f8a9f41b46641a1035 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 12:21:49 +0300 Subject: [PATCH 030/130] test: allow vectored get validation failure during shutdown (#7716) Per [evidence] the timeline ancestor detach tests can panic while shutting down on vectored get validation. Allow the error because tenant is restarted twice in the test. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7708/9058185709/index.html#suites/a1c2be32556270764423c495fad75d47/d444f7e5c0a18ce9 --- test_runner/regress/test_timeline_detach_ancestor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 214e10c32e..1e961a4b2f 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -58,6 +58,8 @@ class Branchpoint(str, enum.Enum): SHUTDOWN_ALLOWED_ERRORS = [ ".*initial size calculation failed: downloading failed, possibly for shutdown", ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*logical_size_calculation_task:panic.*: Sequential get failed with Bad state \\(not active\\).*", + ".*Task 'initial size calculation' .* panicked.*", ] From b58a615197374da349525da843b04849c47d610f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 13 May 2024 11:22:10 +0200 Subject: [PATCH 031/130] chore(pageserver): plumb through RequestContext to VirtualFile read methods (#7720) This PR introduces no functional changes. The `open()` path will be done separately. refs https://github.com/neondatabase/neon/issues/6107 refs https://github.com/neondatabase/neon/issues/7386 --- pageserver/src/tenant/block_io.rs | 7 +- .../src/tenant/ephemeral_file/page_caching.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 9 +- .../src/tenant/storage_layer/image_layer.rs | 5 +- pageserver/src/tenant/vectored_blob_io.rs | 4 +- pageserver/src/virtual_file.rs | 98 +++++++++++++------ 6 files changed, 82 insertions(+), 43 deletions(-) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 37c84be342..92928116c1 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -102,7 +102,7 @@ impl<'a> BlockReaderRef<'a> { #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] - VirtualFile(r) => r.read_blk(blknum).await, + VirtualFile(r) => r.read_blk(blknum, ctx).await, } } } @@ -177,10 +177,11 @@ impl<'a> FileBlockReader<'a> { &self, buf: PageWriteGuard<'static>, blkno: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file - .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64) + .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx) .await } /// Read a block. @@ -206,7 +207,7 @@ impl<'a> FileBlockReader<'a> { ReadBufResult::Found(guard) => Ok(guard.into()), ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer - let write_guard = self.fill_buffer(write_guard, blknum).await?; + let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?; Ok(write_guard.mark_valid().into()) } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 42def8858e..276ac87064 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -78,7 +78,7 @@ impl RW { page_cache::ReadBufResult::NotFound(write_guard) => { let write_guard = writer .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) .await?; let read_guard = write_guard.mark_valid(); return Ok(BlockLease::PageReadGuard(read_guard)); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c38c9bb656..4f30cf2e84 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -908,7 +908,7 @@ impl DeltaLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state) + self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); @@ -1012,6 +1012,7 @@ impl DeltaLayerInner { &self, reads: Vec, reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, ) { let vectored_blob_reader = VectoredBlobReader::new(&self.file); let mut ignore_key_with_err = None; @@ -1029,7 +1030,7 @@ impl DeltaLayerInner { // track when a key is done. for read in reads.into_iter().rev() { let res = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer")) + .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) .await; let blobs_buf = match res { @@ -1274,7 +1275,7 @@ impl DeltaLayerInner { buf.clear(); buf.reserve(read.size()); - let res = reader.read_blobs(&read, buf).await?; + let res = reader.read_blobs(&read, buf, ctx).await?; for blob in res.blobs { let key = blob.meta.key; @@ -1848,7 +1849,7 @@ mod test { for read in vectored_reads { let blobs_buf = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer")) + .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) .await?; for meta in blobs_buf.blobs.iter() { let value = &blobs_buf.buf[meta.start..meta.end]; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c9874873e4..72d1f36cab 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -474,7 +474,7 @@ impl ImageLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state) + self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; Ok(()) @@ -537,6 +537,7 @@ impl ImageLayerInner { &self, reads: Vec, reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, ) { let max_vectored_read_bytes = self .max_vectored_read_bytes @@ -565,7 +566,7 @@ impl ImageLayerInner { } let buf = BytesMut::with_capacity(buf_size); - let res = vectored_blob_reader.read_blobs(&read, buf).await; + let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; match res { Ok(blobs_buf) => { diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 91934d5e0e..6e825760e3 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -23,6 +23,7 @@ use pageserver_api::key::Key; use utils::lsn::Lsn; use utils::vec_map::VecMap; +use crate::context::RequestContext; use crate::virtual_file::VirtualFile; #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -285,6 +286,7 @@ impl<'a> VectoredBlobReader<'a> { &self, read: &VectoredRead, buf: BytesMut, + ctx: &RequestContext, ) -> Result { assert!(read.size() > 0); assert!( @@ -295,7 +297,7 @@ impl<'a> VectoredBlobReader<'a> { ); let buf = self .file - .read_exact_at_n(buf, read.start, read.size()) + .read_exact_at_n(buf, read.start, read.size(), ctx) .await?; let blobs_at = read.blobs_at.as_slice(); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a17488a286..8dee73891b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -576,21 +576,34 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at(&self, buf: B, offset: u64) -> Result + pub async fn read_exact_at( + &self, + buf: B, + offset: u64, + ctx: &RequestContext, + ) -> Result where B: IoBufMut + Send, { - let (buf, res) = - read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await; + let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { + self.read_at(buf, offset, ctx) + }) + .await; res.map(|()| buf) } - pub async fn read_exact_at_n(&self, buf: B, offset: u64, count: usize) -> Result + pub async fn read_exact_at_n( + &self, + buf: B, + offset: u64, + count: usize, + ctx: &RequestContext, + ) -> Result where B: IoBufMut + Send, { let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { - self.read_at(buf, offset) + self.read_at(buf, offset, ctx) }) .await; res.map(|()| buf) @@ -601,12 +614,13 @@ impl VirtualFile { &self, page: PageWriteGuard<'static>, offset: u64, + ctx: &RequestContext, ) -> Result, Error> { let buf = PageWriteGuardBuf { page, init_up_to: 0, }; - let res = self.read_exact_at(buf, offset).await; + let res = self.read_exact_at(buf, offset, ctx).await; res.map(|PageWriteGuardBuf { page, .. }| page) .map_err(|e| Error::new(ErrorKind::Other, e)) } @@ -699,7 +713,12 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) + pub(crate) async fn read_at( + &self, + buf: B, + offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ) -> (B, Result) where B: tokio_epoll_uring::BoundedBufMut + Send, { @@ -1020,20 +1039,21 @@ impl VirtualFile { pub(crate) async fn read_blk( &self, blknum: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; let buf = vec![0; PAGE_SZ]; let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64)) + .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } - async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { let res; - (tmp, res) = self.read_at(tmp, self.pos).await; + (tmp, res) = self.read_at(tmp, self.pos, ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { @@ -1176,9 +1196,14 @@ mod tests { } impl MaybeVirtualFile { - async fn read_exact_at(&self, mut buf: Vec, offset: u64) -> Result, Error> { + async fn read_exact_at( + &self, + mut buf: Vec, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } @@ -1230,13 +1255,13 @@ mod tests { // Helper function to slurp contents of a file, starting at the current position, // into a string - async fn read_string(&mut self) -> Result { + async fn read_string(&mut self, ctx: &RequestContext) -> Result { use std::io::Read; let mut buf = String::new(); match self { MaybeVirtualFile::VirtualFile(file) => { let mut buf = Vec::new(); - file.read_to_end(&mut buf).await?; + file.read_to_end(&mut buf, ctx).await?; return Ok(String::from_utf8(buf).unwrap()); } MaybeVirtualFile::File(file) => { @@ -1247,9 +1272,14 @@ mod tests { } // Helper function to slurp a portion of a file into a string - async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { + async fn read_string_at( + &mut self, + pos: u64, + len: usize, + ctx: &RequestContext, + ) -> Result { let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos).await?; + let buf = self.read_exact_at(buf, pos, ctx).await?; Ok(String::from_utf8(buf).unwrap()) } } @@ -1303,7 +1333,7 @@ mod tests { file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode - let _ = file_a.read_string().await.unwrap_err(); + let _ = file_a.read_string(&ctx).await.unwrap_err(); // Close the file and re-open for reading let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; @@ -1312,24 +1342,24 @@ mod tests { let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read - assert_eq!("foobar", file_a.read_string().await?); + assert_eq!("foobar", file_a.read_string(&ctx).await?); // It's positioned at the EOF now. - assert_eq!("", file_a.read_string().await?); + assert_eq!("", file_a.read_string(&ctx).await?); // Test seeks. assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4); - assert_eq!("ar", file_a.read_string().await?); + assert_eq!("ar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3); - assert_eq!("bar", file_a.read_string().await?); + assert_eq!("bar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Test erroneous seeks to before byte 0 file_a.seek(SeekFrom::End(-7)).await.unwrap_err(); @@ -1337,7 +1367,7 @@ mod tests { file_a.seek(SeekFrom::Current(-2)).await.unwrap_err(); // the erroneous seek should have left the position unchanged - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); @@ -1354,7 +1384,7 @@ mod tests { file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; - assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); + assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); // Open a lot of files, enough to cause some evictions. (Or to be precise, // open the same file many times. The effect is the same.) @@ -1366,7 +1396,7 @@ mod tests { for _ in 0..100 { let mut vfile = openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; - assert_eq!("FOOBAR", vfile.read_string().await?); + assert_eq!("FOOBAR", vfile.read_string(&ctx).await?); vfiles.push(vfile); } @@ -1375,13 +1405,13 @@ mod tests { // The underlying file descriptor for 'file_a' should be closed now. Try to read // from it again. We left the file positioned at offset 1 above. - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Check that all the other FDs still work too. Use them in random order for // good measure. vfiles.as_mut_slice().shuffle(&mut thread_rng()); for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?); + assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); } Ok(()) @@ -1397,6 +1427,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; @@ -1425,12 +1456,13 @@ mod tests { let mut hdls = Vec::new(); for _threadno in 0..THREADS { let files = files.clone(); + let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { let mut buf = vec![0u8; SIZE]; let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0).await.unwrap(); + buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); assert!(buf == SAMPLE); } }); @@ -1446,6 +1478,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1456,7 +1489,7 @@ mod tests { .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); @@ -1465,7 +1498,7 @@ mod tests { .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); drop(file); @@ -1473,6 +1506,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1488,7 +1522,7 @@ mod tests { .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); From f50ff1456091d8e5ac8db2d01f4b058ce36d6569 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 May 2024 13:05:46 +0100 Subject: [PATCH 032/130] pageserver: refuse to run without remote storage (#7722) ## Problem Since https://github.com/neondatabase/neon/pull/6769, the pageserver is intentionally not usable without remote storage: it's purpose is to act as a cache to an object store, rather than as a source of truth in its own right. ## Summary of changes - Make remote storage configuration mandatory: the pageserver will refuse to start if it is not provided. This is a precursor that will make it safe to subsequently remove all the internal Option<>s --- pageserver/src/bin/pageserver.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index eb4b8bb8bb..49f8a41b37 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -383,7 +383,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = create_remote_storage_client(conf)?; + let remote_storage = Some(create_remote_storage_client(conf)?); // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -708,12 +708,11 @@ fn start_pageserver( fn create_remote_storage_client( conf: &'static PageServerConf, -) -> anyhow::Result> { +) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { config } else { - tracing::warn!("no remote storage configured, this is a deprecated configuration"); - return Ok(None); + anyhow::bail!("no remote storage configured, this is a deprecated configuration"); }; // Create the client @@ -733,7 +732,7 @@ fn create_remote_storage_client( GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); } - Ok(Some(remote_storage)) + Ok(remote_storage) } fn cli() -> Command { From d9dcbffac37ccd3331ec9adcd12fd20ce0ea31aa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 15:16:23 +0300 Subject: [PATCH 033/130] python: allow using allowed_errors.py (#7719) See #7718. Fix it by renaming all `types.py` to `common_types.py`. Additionally, add an advert for using `allowed_errors.py` to test any added regex. --- scripts/check_allowed_errors.sh | 18 ++++++++++++++++++ test_runner/fixtures/benchmark_fixture.py | 2 +- .../fixtures/{types.py => common_types.py} | 0 test_runner/fixtures/compute_reconfigure.py | 2 +- test_runner/fixtures/neon_fixtures.py | 4 ++-- .../fixtures/pageserver/allowed_errors.py | 5 +++-- .../pageserver/{types.py => common_types.py} | 2 +- test_runner/fixtures/pageserver/http.py | 2 +- .../fixtures/pageserver/many_tenants.py | 2 +- .../fixtures/pageserver/remote_storage.py | 4 ++-- test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/fixtures/remote_storage.py | 2 +- test_runner/fixtures/safekeeper/http.py | 2 +- test_runner/fixtures/safekeeper/utils.py | 2 +- test_runner/fixtures/utils.py | 7 ++++--- test_runner/fixtures/workload.py | 2 +- test_runner/performance/pageserver/util.py | 2 +- .../performance/test_branch_creation.py | 2 +- test_runner/performance/test_bulk_insert.py | 2 +- .../test_storage_controller_scale.py | 2 +- .../performance/test_wal_backpressure.py | 2 +- test_runner/regress/test_ancestor_branch.py | 2 +- .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_auth.py | 2 +- test_runner/regress/test_branch_and_gc.py | 2 +- test_runner/regress/test_branch_behind.py | 2 +- test_runner/regress/test_branching.py | 2 +- test_runner/regress/test_broken_timeline.py | 2 +- test_runner/regress/test_compatibility.py | 2 +- .../regress/test_disk_usage_eviction.py | 2 +- test_runner/regress/test_duplicate_layers.py | 2 +- test_runner/regress/test_fullbackup.py | 2 +- test_runner/regress/test_gc_aggressive.py | 2 +- test_runner/regress/test_import.py | 2 +- test_runner/regress/test_layer_eviction.py | 2 +- test_runner/regress/test_layers_from_future.py | 4 ++-- .../regress/test_logical_replication.py | 2 +- test_runner/regress/test_lsn_mapping.py | 2 +- test_runner/regress/test_neon_cli.py | 2 +- test_runner/regress/test_next_xid.py | 2 +- test_runner/regress/test_old_request_lsn.py | 2 +- test_runner/regress/test_ondemand_download.py | 2 +- .../regress/test_ondemand_slru_download.py | 2 +- test_runner/regress/test_pageserver_api.py | 2 +- .../regress/test_pageserver_generations.py | 2 +- .../test_pageserver_getpage_throttle.py | 2 +- .../regress/test_pageserver_layer_rolling.py | 2 +- .../test_pageserver_metric_collection.py | 2 +- .../regress/test_pageserver_secondary.py | 4 ++-- test_runner/regress/test_pitr_gc.py | 2 +- test_runner/regress/test_read_trace.py | 2 +- test_runner/regress/test_readonly_node.py | 2 +- test_runner/regress/test_remote_storage.py | 4 ++-- test_runner/regress/test_s3_restore.py | 2 +- test_runner/regress/test_s3_scrubber.py | 2 +- test_runner/regress/test_sharding.py | 2 +- test_runner/regress/test_storage_controller.py | 2 +- test_runner/regress/test_tenant_conf.py | 2 +- test_runner/regress/test_tenant_delete.py | 2 +- test_runner/regress/test_tenant_detach.py | 2 +- test_runner/regress/test_tenant_relocation.py | 2 +- test_runner/regress/test_tenant_size.py | 2 +- test_runner/regress/test_tenant_tasks.py | 2 +- test_runner/regress/test_tenants.py | 2 +- .../test_tenants_with_remote_storage.py | 4 ++-- test_runner/regress/test_timeline_delete.py | 2 +- .../regress/test_timeline_detach_ancestor.py | 2 +- test_runner/regress/test_timeline_size.py | 2 +- test_runner/regress/test_wal_acceptor.py | 2 +- test_runner/regress/test_wal_acceptor_async.py | 2 +- test_runner/regress/test_wal_receiver.py | 2 +- test_runner/regress/test_wal_restore.py | 2 +- .../test_walredo_not_left_behind_on_detach.py | 2 +- 73 files changed, 100 insertions(+), 80 deletions(-) create mode 100755 scripts/check_allowed_errors.sh rename test_runner/fixtures/{types.py => common_types.py} (100%) rename test_runner/fixtures/pageserver/{types.py => common_types.py} (98%) diff --git a/scripts/check_allowed_errors.sh b/scripts/check_allowed_errors.sh new file mode 100755 index 0000000000..87e52c1e64 --- /dev/null +++ b/scripts/check_allowed_errors.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -eu + +HELPER_DIR="$(dirname "${BASH_SOURCE[0]}")" +SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py" + +# first run to understand all of the errors: +# +# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log +# example: ./scripts/check_allowed_errors.sh -i pageserver.log +# +# then edit the test local allowed_errors to the +# test_runner/fixtures/pageserver/allowed_errors.py, then re-run to make sure +# they are handled. +# +# finally revert any local changes to allowed_errors.py. +poetry run python3 "$HELPER_DIR/../$SCRIPT" $* diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index c32748f6f0..038f557cc8 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -19,9 +19,9 @@ from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from _pytest.terminal import TerminalReporter +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver -from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/common_types.py similarity index 100% rename from test_runner/fixtures/types.py rename to test_runner/fixtures/common_types.py diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index a883d94f73..66fc35b6aa 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -5,8 +5,8 @@ import pytest from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +from fixtures.common_types import TenantId from fixtures.log_helper import log -from fixtures.types import TenantId class ComputeReconfigure: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index da379693a0..0c2b70202e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -47,14 +47,15 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) +from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -72,7 +73,6 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 58a76d7586..91cd67d107 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -131,9 +131,10 @@ if __name__ == "__main__": "-i", "--input", type=argparse.FileType("r"), - default=sys.stdin, - help="Pageserver logs file. Reads from stdin if no file is provided.", + help="Pageserver logs file. Use '-' for stdin.", + required=True, ) + args = parser.parse_args() errors = _check_allowed_errors(args.input) diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/common_types.py similarity index 98% rename from test_runner/fixtures/pageserver/types.py rename to test_runner/fixtures/pageserver/common_types.py index 1fb618f445..a6c327a8a0 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -2,7 +2,7 @@ import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union -from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn +from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn @dataclass diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index b06972056c..0b2963d89c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -11,10 +11,10 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import Fn diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index f47a3ea043..def80a1c3e 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -3,6 +3,7 @@ import time from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.remote_storage +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -12,7 +13,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_state, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId, TimelineId def single_timeline( diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py index e6cd9b4614..0c3612716a 100644 --- a/test_runner/fixtures/pageserver/remote_storage.py +++ b/test_runner/fixtures/pageserver/remote_storage.py @@ -6,13 +6,13 @@ import threading from pathlib import Path from typing import Any, List, Tuple +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import NeonEnv, Pagectl -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( InvalidFileName, parse_layer_file_name, ) from fixtures.remote_storage import LocalFsStorage -from fixtures.types import TenantId, TimelineId def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 4b0dd7a815..91435e8a1f 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -8,10 +8,10 @@ from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 132d2450a7..ee18c53b52 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,8 @@ import boto3 import toml from mypy_boto3_s3 import S3Client +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.types import TenantId, TimelineId TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index b9c1986818..82148d0556 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.types import Lsn, TenantId, TimelineId # Walreceiver as returned by sk's timeline status endpoint. diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 2818a493d6..0e4b5d7883 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.safekeeper.http import SafekeeperHttpClient -from fixtures.types import TenantId, TimelineId def are_walreceivers_absent( diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 6470621900..16dc9e8cfb 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -25,14 +25,14 @@ import zstandard from psycopg2.extensions import cursor from fixtures.log_helper import log -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) if TYPE_CHECKING: from fixtures.neon_fixtures import PgBin -from fixtures.types import TimelineId +from fixtures.common_types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -452,6 +452,7 @@ def humantime_to_ms(humantime: str) -> float: def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + # FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py error_or_warn = re.compile(r"\s(ERROR|WARN)") errors = [] for lineno, line in enumerate(input, start=1): @@ -484,7 +485,7 @@ def assert_no_errors(log_file, service, allowed_errors): for _lineno, error in errors: log.info(f"not allowed {service} error: {error.strip()}") - assert not errors, f"Log errors on {service}: {errors[0]}" + assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" @enum.unique diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index c44628ce06..dfd9caba3e 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,6 +1,7 @@ import threading from typing import Any, Optional +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import TenantId, TimelineId # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 009d62c9ba..f31cd9a9f8 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -5,13 +5,13 @@ Utilities used by all code in this sub-directory from typing import Any, Callable, Dict, Tuple import fixtures.pageserver.many_tenants as many_tenants +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.utils import wait_until_all_tenants_state -from fixtures.types import TenantId, TimelineId def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 7687b8417f..b3866f1813 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -9,11 +9,11 @@ from typing import List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import wait_until from prometheus_client.samples import Sample diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 1df3f2f5f1..3f56da7c1d 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -2,10 +2,10 @@ from contextlib import closing import pytest from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare from fixtures.pageserver.utils import wait_tenant_status_404 from fixtures.pg_version import PgVersion -from fixtures.types import Lsn # diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 632d465c3f..cb013ae8c3 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -3,6 +3,7 @@ import random import time import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TenantShardId, TimelineId @pytest.mark.timeout(3600) # super long running test: should go down as we optimize diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 7eb244d378..513ebc74c3 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -6,10 +6,10 @@ from typing import Any, Callable, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin -from fixtures.types import Lsn from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index d16d2d6a24..7e40081aa2 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 693add422f..2ec375271c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,13 +2,13 @@ from dataclasses import dataclass from typing import Generator, Optional import pytest +from fixtures.common_types import TenantId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException, TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index bb622c0d59..035ab2796f 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -4,13 +4,13 @@ from pathlib import Path import psycopg2 import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgProtocol, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.types import TenantId, TimelineId def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index ddd02238ea..eb503ddbfa 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -2,10 +2,10 @@ import threading import time import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index b79cad979f..ac2fc79be4 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9fe9f77fea..03d6946c15 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor from typing import List import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -14,7 +15,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 1279c1bf81..7d4e101189 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -3,6 +3,7 @@ import os from typing import List, Tuple import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -11,7 +12,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 787c114fc1..ef35bf4696 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,6 +7,7 @@ from typing import List, Optional import pytest import toml +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -21,7 +22,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 1d73e9cb18..7ae2352c06 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from typing import Any, Dict, Iterable, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -16,7 +17,6 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index 7471338ce5..0ebb99c712 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -2,7 +2,7 @@ import time import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload_queue_empty, diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index d5f898492b..e1e4f700d4 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,6 +1,7 @@ import os from pathlib import Path +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -8,7 +9,6 @@ from fixtures.neon_fixtures import ( VanillaPostgres, ) from fixtures.port_distributor import PortDistributor -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index c5070ee815..e5067bba8b 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,6 +2,7 @@ import asyncio import concurrent.futures import random +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TimelineId # Test configuration # diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 132427ba2d..1f1c8cc582 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -7,6 +7,7 @@ from contextlib import closing from pathlib import Path import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -20,7 +21,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index b178baea11..0ef4f6d95b 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( flush_ep_to_pageserver, wait_for_last_flush_lsn, ) -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index cc34fd83e9..18e5111786 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -1,8 +1,9 @@ import time +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( DeltaLayerName, ImageLayerName, is_future_layer, @@ -13,7 +14,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 57d3447cae..a657d5a035 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,6 +4,7 @@ from random import choice from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -12,7 +13,6 @@ from fixtures.neon_fixtures import ( logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 225622868d..83d52d4c4c 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -2,10 +2,10 @@ import re import time from datetime import datetime, timedelta, timezone +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import PageserverApiException -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index cb69f0ef39..ba170cfb4c 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,6 +5,7 @@ from typing import cast import pytest import requests +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion, skip_on_postgres -from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index e880445c4d..45c0e3e409 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -3,13 +3,13 @@ import os import time from pathlib import Path +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 391305c58a..43b0bb56f0 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 6c2556f6a2..b51754c9e0 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -5,6 +5,7 @@ import time from collections import defaultdict from typing import Any, DefaultDict, Dict, Tuple +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -21,7 +22,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 0b36b32552..4af7dcdfc3 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -1,9 +1,9 @@ from typing import Optional import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index bd7e4f118f..80a1d72f4a 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -3,13 +3,13 @@ from pathlib import Path from typing import Optional import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 58eaf404d3..a38bcd45da 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -16,6 +16,7 @@ import time from typing import Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -37,7 +38,6 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import TenantId, TimelineId from fixtures.utils import print_gc_result, wait_until from fixtures.workload import Workload diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 42cc28efee..111285b40c 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -2,10 +2,10 @@ import json import uuid from anyio import Path +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin from fixtures.pg_version import PgVersion -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c5dc0f2919..aab0536f5a 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -5,6 +5,7 @@ from typing import Optional, Tuple import psutil import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until TIMELINE_COUNT = 10 diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index c34ef46d07..b0465f2a96 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -7,6 +7,7 @@ from pathlib import Path from queue import SimpleQueue from typing import Any, Dict, Set +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -17,7 +18,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, remote_storage_to_toml_inline_table, ) -from fixtures.types import TenantId, TimelineId from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index c40bb962f2..fdc09a063d 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -5,9 +5,10 @@ import time from typing import Any, Dict, Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, @@ -15,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 539ef3eda7..6434f431a4 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index e6b3ccd7ec..cc5853b727 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,8 +1,8 @@ from contextlib import closing +from fixtures.common_types import Lsn from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index b7c8f36107..ba8b91e84d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 70c025c225..7f79bf5d5c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,13 +6,14 @@ import time from typing import Dict, List, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -25,7 +26,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, available_remote_storages, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( assert_eq, assert_ge, diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 9227836862..7fdabaaec7 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -1,6 +1,7 @@ import time from datetime import datetime, timezone +from fixtures.common_types import Lsn from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -14,7 +15,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import Lsn from fixtures.utils import run_pg_bench_small diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py index 018c1637d0..8981000c24 100644 --- a/test_runner/regress/test_s3_scrubber.py +++ b/test_runner/regress/test_s3_scrubber.py @@ -3,12 +3,12 @@ import shutil from typing import Optional import pytest +from fixtures.common_types import TenantShardId from fixtures.neon_fixtures import ( NeonEnvBuilder, S3Scrubber, ) from fixtures.remote_storage import S3Storage, s3_storage -from fixtures.types import TenantShardId from fixtures.workload import Workload diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index d33803250f..87544af598 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -5,6 +5,7 @@ from typing import Dict, List, Optional, Union import pytest import requests +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -18,7 +19,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage -from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index bdd356388f..4a501e60ed 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -6,6 +6,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Union import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -25,7 +26,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index fc099297e1..a345464208 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -2,13 +2,13 @@ import json from contextlib import closing import psycopg2.extras +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 363c3c88ec..3fc44de6fa 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,6 +5,7 @@ import shutil from threading import Thread import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -26,7 +27,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_state, ) from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0ba0108651..12a4730e69 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -7,6 +7,7 @@ from typing import List, Optional import asyncpg import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -22,7 +23,6 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 68d9d9a660..be289e03d6 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any, Dict, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver from fixtures.pageserver.http import PageserverHttpClient @@ -20,7 +21,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, wait_until, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index e73eae91f0..7894f6933d 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import List, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -19,7 +20,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 75e5c2c91c..d08ad3cd2e 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import ( @@ -5,7 +6,6 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_until_tenant_active, ) -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 2832304dcc..93e9ad3673 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -9,6 +9,7 @@ from typing import List import pytest import requests +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -24,7 +25,6 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId from fixtures.utils import wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index a1e96928bf..168876b711 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -11,6 +11,7 @@ import os from pathlib import Path from typing import List, Tuple +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -18,7 +19,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) -from fixtures.pageserver.types import parse_layer_file_name +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -28,7 +29,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 0eb1327c9e..da37f469b3 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -6,6 +6,7 @@ import threading import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -31,7 +32,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, s3_storage, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until from urllib3.util.retry import Retry diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 1e961a4b2f..8406de8bc1 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -10,6 +10,7 @@ from threading import Barrier from typing import IO, List, Set, Tuple, Union import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,7 +20,6 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import HistoricLayerInfo from fixtures.pageserver.utils import wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage -from fixtures.types import Lsn, TimelineId def by_end_lsn(info: HistoricLayerInfo) -> Lsn: diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 628c484fbd..18063bf104 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -10,6 +10,7 @@ from typing import Optional import psycopg2.errors import psycopg2.extras import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -31,7 +32,6 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size, wait_until diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 967d133e18..ea66eeff63 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -18,6 +18,7 @@ import psycopg2.errors import psycopg2.extras import pytest from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( @@ -47,7 +48,6 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index dce5616ac6..b5d86de574 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -8,10 +8,10 @@ from typing import List, Optional import asyncpg import pytest import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 7ac6e6332c..d9265dcbcd 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,8 +1,8 @@ import time +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 083a259d85..01a1d5cf55 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -6,6 +6,7 @@ from typing import List import pytest import zstandard +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,7 +20,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage -from fixtures.types import Lsn, TenantId, TimelineId from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 13159efbe8..ad37807dba 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -2,10 +2,10 @@ import time import psutil import pytest +from fixtures.common_types import TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False): From 5a0da93c530e99a374a9b2cabc1e2034a68773f1 Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Mon, 13 May 2024 14:18:14 +0200 Subject: [PATCH 034/130] Fix `test_lock_time_tracing` flakiness (#7712) ## Problem Closes [test_lock_time_tracing](https://github.com/neondatabase/neon/issues/7691) ## Summary of changes Taking a look at the execution of the same test in logs, it can be concluded that the time we are holding the lock is sometimes not enough(must be above 30s) to cause the second log to be shown by the thread that is creating a timeline. In the [successful execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9021247520/index.html#testresult/a21bce8c702b37f0) it can be seen that the log `Operation TimelineCreate on key 5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared lock` was on the edge of being logged, if it was below 30s it would not be shown. ``` 2024-05-09T18:02:32.552093Z WARN request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Lock on UpdatePolicy was held for 31.001892592s 2024-05-09T18:02:32.552109Z INFO request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Request handled, status: 200 OK 2024-05-09T18:02:32.552271Z WARN request{method=POST path=/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/timeline request_id=d3af756e-dbb3-476b-89bd-3594f19bbb67}: Operation TimelineCreate on key 5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared lock ``` In the [failed execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9022743601/index.html#/testresult/deb90136aeae4fce): ``` 2024-05-09T20:14:33.526311Z INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af 2024-05-09T20:14:36.441165Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:41.441657Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:41.535227Z INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Handling request 2024-05-09T20:14:41.535269Z INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: handle_validate: 68194ffadb61ca11adcbb11cbeb4ec6e(gen 1): valid=true (latest Some(00000001)) 2024-05-09T20:14:41.535284Z INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Request handled, status: 200 OK 2024-05-09T20:14:46.441854Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:51.441151Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:14:56.441199Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:15:01.440971Z INFO Heartbeat round complete for 1 nodes, 0 offline 2024-05-09T20:15:03.516320Z INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: failpoint "tenant-update-policy-exclusive-lock": sleep done 2024-05-09T20:15:03.518474Z INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Updated scheduling policy to Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000 2024-05-09T20:15:03.518512Z WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000 2024-05-09T20:15:03.518540Z WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Lock on UpdatePolicy was held for 31.003712703s 2024-05-09T20:15:03.518570Z INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Request handled, status: 200 OK 2024-05-09T20:15:03.518804Z WARN request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000 2024-05-09T20:15:03.518815Z INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline on shard 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af, attached to node 1 (localhost) ``` we can see that the difference between starting to create timeline `2024-05-09T20:14:33.526311Z` and creating timeline `2024-05-09T20:15:03.518815Z` is not above 30s and will not cause any logs to appear. The proposed solution is to prolong how long we will pause to ensure that the thread that creates the timeline waits above 30s. --- test_runner/regress/test_storage_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 4a501e60ed..3a9a522f3f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1284,7 +1284,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): # Apply failpoint env.storage_controller.configure_failpoints( - ("tenant-update-policy-exclusive-lock", "return(31000)") + ("tenant-update-policy-exclusive-lock", "return(35000)") ) # This will hold the exclusive for enough time to cause an warning @@ -1306,7 +1306,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): env.storage_controller.pageserver_api().timeline_create( pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id ) - thread_update_tenant_policy.join(timeout=10) + thread_update_tenant_policy.join() env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") env.storage_controller.assert_log_contains( From bbe730d7caec380edb389487fb1d6db4c9b12cc8 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 13 May 2024 13:41:14 +0100 Subject: [PATCH 035/130] Revert protocol version upgrade (#7727) ## Problem "John pointed out that the switch to protocol version 2 made test_gc_aggressive test flaky: https://github.com/neondatabase/neon/issues/7692. I tracked it down, and that is indeed an issue. Conditions for hitting the issue: The problem occurs in the primary GC horizon is set to a very low value, e.g. 0. If the primary is actively writing WAL, and GC runs in the pageserver at the same time that the primary sends a GetPage request, it's possible that the GC advances the GC horizon past the GetPage request's LSN. I'm working on a fix here: https://github.com/neondatabase/neon/pull/7708." - Heikki ## Summary of changes Use protocol version 1 as default. --- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5ce2caff3..b7b1e7ccbf 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,7 +49,7 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int neon_protocol_version = 2; +int neon_protocol_version = 1; static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; @@ -860,7 +860,7 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 2, /* use protocol version 2 */ + 1, /* default to old protocol for now */ 1, /* min */ 2, /* max */ PGC_SU_BACKEND, From 6ff74295b5b21d54192d20d114d78621d8d53ba0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 13 May 2024 14:52:06 +0200 Subject: [PATCH 036/130] chore(pageserver): plumb through RequestContext to VirtualFile open methods (#7725) This PR introduces no functional changes. The `open()` path will be done separately. refs https://github.com/neondatabase/neon/issues/6107 refs https://github.com/neondatabase/neon/issues/7386 Co-authored-by: Joonas Koivunen --- pageserver/ctl/src/layer_map_analyzer.rs | 2 +- pageserver/ctl/src/layers.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/blob_io.rs | 4 +- pageserver/src/tenant/ephemeral_file.rs | 4 +- .../tenant/remote_timeline_client/download.rs | 21 ++-- .../src/tenant/storage_layer/delta_layer.rs | 10 +- .../src/tenant/storage_layer/image_layer.rs | 8 +- .../tenant/storage_layer/inmemory_layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 35 ++++-- pageserver/src/tenant/timeline/compaction.rs | 4 + .../src/tenant/timeline/detach_ancestor.rs | 2 + .../src/tenant/timeline/layer_manager.rs | 4 +- pageserver/src/virtual_file.rs | 102 +++++++++++++----- 14 files changed, 143 insertions(+), 61 deletions(-) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index c4c282f33d..b4bb239f44 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { - let file = VirtualFile::open(path).await?; + let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index be8f91675d..3611b0baab 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); - let file = VirtualFile::open(path).await?; + let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a4215ee107..ffcab5f140 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1671,7 +1671,7 @@ impl<'a> DatadirModification<'a> { } if !self.pending_deletions.is_empty() { - writer.delete_batch(&self.pending_deletions).await?; + writer.delete_batch(&self.pending_deletions, ctx).await?; self.pending_deletions.clear(); } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 1dc451f5c9..24b4e4f3ea 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -299,7 +299,7 @@ mod tests { // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path()).await?; + let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; @@ -314,7 +314,7 @@ mod tests { wtr.flush_buffer(&ctx).await?; } - let file = VirtualFile::open(pathbuf.as_path()).await?; + let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); let rdr = BlockCursor::new(rdr); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 8b815a1885..79cc7bf153 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -28,6 +28,7 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = @@ -45,6 +46,7 @@ impl EphemeralFile { .read(true) .write(true) .create(true), + ctx, ) .await?; @@ -153,7 +155,7 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b464437422..f3c9e64533 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -112,14 +112,17 @@ pub async fn download_layer_file<'a>( // We use fatal_err() below because the after the rename above, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. - let work = async move { - let timeline_dir = VirtualFile::open(&timeline_path) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); + let work = { + let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior()); + async move { + let timeline_dir = VirtualFile::open(&timeline_path, &ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } }; crate::virtual_file::io_engine::get() .spawn_blocking_and_block_on_if_std(work) @@ -196,7 +199,7 @@ async fn download_object<'a>( use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; use bytes::BytesMut; async { - let destination_file = VirtualFile::create(dst_path) + let destination_file = VirtualFile::create(dst_path, ctx) .await .with_context(|| format!("create a destination file for layer '{dst_path}'")) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4f30cf2e84..1b3802840f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -394,6 +394,7 @@ impl DeltaLayerWriterInner { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know // the end key yet, so we cannot form the final filename yet. We will @@ -404,7 +405,7 @@ impl DeltaLayerWriterInner { let path = DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range); - let mut file = VirtualFile::create(&path).await?; + let mut file = VirtualFile::create(&path, ctx).await?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); @@ -586,6 +587,7 @@ impl DeltaLayerWriter { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( @@ -595,6 +597,7 @@ impl DeltaLayerWriter { tenant_shard_id, key_start, lsn_range, + ctx, ) .await?, ), @@ -701,6 +704,7 @@ impl DeltaLayer { let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; @@ -734,7 +738,7 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; @@ -1792,6 +1796,7 @@ mod test { harness.tenant_shard_id, entries_meta.key_range.start, entries_meta.lsn_range.clone(), + &ctx, ) .await?; @@ -1979,6 +1984,7 @@ mod test { tenant.tenant_shard_id, Key::MIN, Lsn(0x11)..truncate_at, + ctx, ) .await .unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 72d1f36cab..6ea452b993 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -343,6 +343,7 @@ impl ImageLayer { let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; @@ -377,7 +378,7 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; @@ -632,6 +633,7 @@ impl ImageLayerWriterInner { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. @@ -651,6 +653,7 @@ impl ImageLayerWriterInner { virtual_file::OpenOptions::new() .write(true) .create_new(true), + ctx, ) .await? }; @@ -805,10 +808,11 @@ impl ImageLayerWriter { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn) + ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx) .await?, ), }) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 4dacbec2f3..9553f83026 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -473,10 +473,11 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { @@ -642,6 +643,7 @@ impl InMemoryLayer { self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, + ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 505dc8c30d..d2fcd6c4a5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3560,7 +3560,11 @@ impl Timeline { /// /// Get a handle to the latest layer for appending. /// - async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + async fn get_layer_for_write( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { let mut guard = self.layers.write().await; let layer = guard .get_layer_for_write( @@ -3569,6 +3573,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, + ctx, ) .await?; Ok(layer) @@ -3833,8 +3838,8 @@ impl Timeline { ); self.create_delta_layer( &frozen_layer, - ctx, Some(metadata_keyspace.0.ranges[0].clone()), + ctx, ) .await? } else { @@ -3863,7 +3868,7 @@ impl Timeline { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. // We will remove frozen layer and add delta layer in one atomic operation later. - let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else { + let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else { panic!("delta layer cannot be empty if no filter is applied"); }; ( @@ -3992,8 +3997,8 @@ impl Timeline { async fn create_delta_layer( self: &Arc, frozen_layer: &Arc, - ctx: &RequestContext, key_range: Option>, + ctx: &RequestContext, ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); @@ -4016,6 +4021,7 @@ impl Timeline { &self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + &ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -4209,6 +4215,7 @@ impl Timeline { self.tenant_shard_id, &img_range, lsn, + ctx, ) .await?; @@ -4313,6 +4320,7 @@ impl Timeline { &self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -5214,7 +5222,7 @@ impl<'a> TimelineWriter<'a> { let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); let action = self.get_open_layer_action(lsn, buf_size); - let layer = self.handle_open_layer_action(lsn, action).await?; + let layer = self.handle_open_layer_action(lsn, action, ctx).await?; let res = layer.put_value(key, lsn, &buf, ctx).await; if res.is_ok() { @@ -5237,14 +5245,15 @@ impl<'a> TimelineWriter<'a> { &mut self, at: Lsn, action: OpenLayerAction, + ctx: &RequestContext, ) -> anyhow::Result<&Arc> { match action { OpenLayerAction::Roll => { let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); self.roll_layer(freeze_at).await?; - self.open_layer(at).await?; + self.open_layer(at, ctx).await?; } - OpenLayerAction::Open => self.open_layer(at).await?, + OpenLayerAction::Open => self.open_layer(at, ctx).await?, OpenLayerAction::None => { assert!(self.write_guard.is_some()); } @@ -5253,8 +5262,8 @@ impl<'a> TimelineWriter<'a> { Ok(&self.write_guard.as_ref().unwrap().open_layer) } - async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> { - let layer = self.tl.get_layer_for_write(at).await?; + async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at, ctx).await?; let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); @@ -5331,10 +5340,14 @@ impl<'a> TimelineWriter<'a> { Ok(()) } - pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + pub(crate) async fn delete_batch( + &mut self, + batch: &[(Range, Lsn)], + ctx: &RequestContext, + ) -> anyhow::Result<()> { if let Some((_, lsn)) = batch.first() { let action = self.get_open_layer_action(*lsn, 0); - let layer = self.handle_open_layer_action(*lsn, action).await?; + let layer = self.handle_open_layer_action(*lsn, action, ctx).await?; layer.put_tombstones(batch).await?; } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e83878b8fb..4226bf431e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -700,6 +700,7 @@ impl Timeline { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); lsn_range.clone() }, + ctx, ) .await?, ); @@ -755,6 +756,7 @@ impl Timeline { &self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -1093,6 +1095,7 @@ impl CompactionJobExecutor for TimelineAdaptor { self.timeline.tenant_shard_id, key_range.start, lsn_range.clone(), + ctx, ) .await?; @@ -1167,6 +1170,7 @@ impl TimelineAdaptor { self.timeline.tenant_shard_id, key_range, lsn, + ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 69b82344a6..9471ba860f 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -215,6 +215,7 @@ pub(super) async fn prepare( &detached .conf .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), + ctx, ) .await .fatal_err("VirtualFile::open for timeline dir fsync"); @@ -339,6 +340,7 @@ async fn copy_lsn_prefix( target_timeline.tenant_shard_id, layer.layer_desc().key_range.start, layer.layer_desc().lsn_range.start..end_lsn, + ctx, ) .await .map_err(CopyDeltaPrefix)?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index a72eb1b3bf..248420e632 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -9,6 +9,7 @@ use utils::{ use crate::{ config::PageServerConf, + context::RequestContext, metrics::TimelineMetrics, tenant::{ layer_map::{BatchedUpdates, LayerMap}, @@ -69,6 +70,7 @@ impl LayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + ctx: &RequestContext, ) -> Result> { ensure!(lsn.is_aligned()); @@ -105,7 +107,7 @@ impl LayerManager { ); let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?; + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 8dee73891b..b68f3a0e89 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -344,16 +344,23 @@ macro_rules! with_file { impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub async fn open(path: &Utf8Path) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true)).await + pub async fn open( + path: &Utf8Path, + ctx: &RequestContext, + ) -> Result { + Self::open_with_options(path, OpenOptions::new().read(true), ctx).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. - pub async fn create(path: &Utf8Path) -> Result { + pub async fn create( + path: &Utf8Path, + ctx: &RequestContext, + ) -> Result { Self::open_with_options( path, OpenOptions::new().write(true).create(true).truncate(true), + ctx, ) .await } @@ -366,6 +373,7 @@ impl VirtualFile { pub async fn open_with_options( path: &Utf8Path, open_options: &OpenOptions, + _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { let path_str = path.to_string(); let parts = path_str.split('/').collect::>(); @@ -1179,7 +1187,6 @@ mod tests { use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; - use std::future::Future; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; @@ -1293,41 +1300,69 @@ mod tests { // results with VirtualFiles as with native Files. (Except that with // native files, you will run out of file descriptors if the ulimit // is low enough.) - test_files("virtual_files", |path, open_options| async move { - let vf = VirtualFile::open_with_options(&path, &open_options).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - }) - .await + struct A; + + impl Adapter for A { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result { + let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?; + Ok(MaybeVirtualFile::VirtualFile(vf)) + } + } + test_files::("virtual_files").await } #[tokio::test] async fn test_physical_files() -> anyhow::Result<()> { - test_files("physical_files", |path, open_options| async move { - Ok(MaybeVirtualFile::File({ - let owned_fd = open_options.open(path.as_std_path()).await?; - File::from(owned_fd) - })) - }) - .await + struct B; + + impl Adapter for B { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + _ctx: &RequestContext, + ) -> Result { + Ok(MaybeVirtualFile::File({ + let owned_fd = opts.open(path.as_std_path()).await?; + File::from(owned_fd) + })) + } + } + + test_files::("physical_files").await } - async fn test_files(testname: &str, openfunc: OF) -> anyhow::Result<()> + /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition + /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function + /// in trait which benefits from the new lifetime capture rules already. + trait Adapter { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result; + } + + async fn test_files(testname: &str) -> anyhow::Result<()> where - OF: Fn(Utf8PathBuf, OpenOptions) -> FT, - FT: Future>, + A: Adapter, { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); - let mut file_a = openfunc( + let mut file_a = A::open( path_a.clone(), OpenOptions::new() .write(true) .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; file_a.write_all(b"foobar".to_vec(), &ctx).await?; @@ -1336,7 +1371,7 @@ mod tests { let _ = file_a.read_string(&ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; // cannot write to a file opened in read-only mode let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); @@ -1371,7 +1406,7 @@ mod tests { // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); - let mut file_b = openfunc( + let mut file_b = A::open( path_b.clone(), OpenOptions::new() .read(true) @@ -1379,6 +1414,7 @@ mod tests { .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; @@ -1394,8 +1430,12 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = - openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; + let mut vfile = A::open( + path_b.clone(), + OpenOptions::new().read(true).to_owned(), + &ctx, + ) + .await?; assert_eq!("FOOBAR", vfile.read_string(&ctx).await?); vfiles.push(vfile); } @@ -1441,8 +1481,12 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true)) - .await?; + let f = VirtualFile::open_with_options( + &test_file_path, + OpenOptions::new().read(true), + &ctx, + ) + .await?; files.push(f); } let files = Arc::new(files); @@ -1488,7 +1532,7 @@ mod tests { VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); @@ -1497,7 +1541,7 @@ mod tests { VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); @@ -1521,7 +1565,7 @@ mod tests { .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); From 55ba885f6be431b2b694bca4d95d7a40f4bda98a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 13 May 2024 14:16:03 +0100 Subject: [PATCH 037/130] CI(report-benchmarks-failures): report benchmarks failures to slack (#7678) ## Problem `benchmarks` job that we run on the main doesn't block anything, so it's easy to miss its failure. Ref https://github.com/neondatabase/cloud/issues/13087 ## Summary of changes - Add `report-benchmarks-failures` job that report failures of `benchmarks` job to a Slack channel --- .github/workflows/build_and_test.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 21e7a56670..f417cecd58 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -546,9 +546,27 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones + report-benchmarks-failures: + needs: [ benchmarks, create-test-report ] + if: github.ref_name == 'main' && failure() + runs-on: ubuntu-latest + + steps: + - uses: slackapi/slack-github-action@v1 + with: + channel-id: C060CNA47S9 # on-call-staging-storage-stream + slack-message: | + Benchmarks failed on main: ${{ github.event.head_commit.url }} + + Allure report: ${{ needs.create-test-report.outputs.report-url }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + create-test-report: needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, gen3, small ] container: From 4d8a10af1cc41563cc4542beb327a2d75fb1bad8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 May 2024 17:49:50 +0300 Subject: [PATCH 038/130] fix: do not create metrics contention from background task permit (#7730) The background task loop permit metrics do two of `with_label_values` very often. Change the codepath to cache the counters on first access into a `Lazy` with `enum_map::EnumMap`. The expectation is that this should not fix for metric collection failures under load, but it doesn't hurt. Cc: #7161 --- pageserver/src/tenant/tasks.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index f153719f98..ba2b8afd03 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -41,7 +41,7 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { - let s: &'static str = self.into(); - s + self.into() } } +static PERMIT_GAUGES: once_cell::sync::Lazy< + enum_map::EnumMap, +> = once_cell::sync::Lazy::new(|| { + enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()]) + })) +}); + /// Cancellation safe. pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, ) -> tokio::sync::SemaphorePermit<'static> { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE - .with_label_values(&[loop_kind.as_static_str()]) - .guard(); + let _guard = PERMIT_GAUGES[loop_kind].guard(); pausable_failpoint!( "initial-size-calculation-permit-pause", From 7f517640011a5dc2c811c66c2767ae58e27d0c79 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 13 May 2024 11:33:41 -0400 Subject: [PATCH 039/130] feat(pageserver): add metrics for aux file size (#7623) ref https://github.com/neondatabase/neon/issues/7443 ## Summary of changes This pull request adds a size estimator for aux files. Each timeline stores a cached `isize` for the estimated total size of aux files. It gets reset on basebackup, and gets updated for each aux file modification. TODO: print a warning when it exceeds the size. The size metrics is not accurate. Race between `on_basebackup` and other functions could create a negative basebackup size, but the chance is rare. Anyways, this does not impose any extra I/Os to the storage as everything is computed in-memory. The aux files are only stored on shard 0. As basebackups are only generated on shard 0, only shard 0 will report this metrics. --------- Signed-off-by: Alex Chi Z --- pageserver/src/aux_file.rs | 52 +++++++++++++++++++++++++++++ pageserver/src/metrics.rs | 15 +++++++++ pageserver/src/pgdatadir_mapping.rs | 52 +++++++++++++++++++++-------- pageserver/src/tenant/timeline.rs | 32 ++++++++++++------ test_runner/fixtures/metrics.py | 1 + 5 files changed, 128 insertions(+), 24 deletions(-) diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index a26ed84a0d..e6d950487d 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -1,3 +1,6 @@ +use std::sync::Arc; + +use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; @@ -140,6 +143,55 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { Ok(encoded) } +/// An estimation of the size of aux files. +pub struct AuxFileSizeEstimator { + aux_file_size_gauge: IntGauge, + size: Arc>>, +} + +impl AuxFileSizeEstimator { + pub fn new(aux_file_size_gauge: IntGauge) -> Self { + Self { + aux_file_size_gauge, + size: Arc::new(std::sync::Mutex::new(None)), + } + } + + pub fn on_base_backup(&self, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + *guard = Some(new_size as isize); + self.report(new_size as isize); + } + + pub fn on_add(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += file_size as isize; + self.report(*size); + } + } + + pub fn on_remove(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size -= file_size as isize; + self.report(*size); + } + } + + pub fn on_update(&self, old_size: usize, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += new_size as isize - old_size as isize; + self.report(*size); + } + } + + pub fn report(&self, size: isize) { + self.aux_file_size_gauge.set(size as i64); + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 256f2f334c..b27bfb43b0 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -585,6 +585,15 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +static AUX_FILE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_aux_file_estimated_size", + "The size of all aux files for a timeline in aux file v2 store.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -2115,6 +2124,7 @@ pub(crate) struct TimelineMetrics { resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, + pub aux_file_size_gauge: IntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, @@ -2187,6 +2197,9 @@ impl TimelineMetrics { let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let aux_file_size_gauge = AUX_FILE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 let directory_entries_count_gauge_closure = { let tenant_shard_id = *tenant_shard_id; @@ -2224,6 +2237,7 @@ impl TimelineMetrics { last_record_gauge, resident_physical_size_gauge, current_logical_size_gauge, + aux_file_size_gauge, directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( @@ -2264,6 +2278,7 @@ impl TimelineMetrics { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ffcab5f140..1092d64d33 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -699,13 +699,17 @@ impl Timeline { .await .context("scan")?; let mut result = HashMap::new(); + let mut sz = 0; for (_, v) in kv { let v = v.context("get value")?; let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; for (fname, content) in v { + sz += fname.len(); + sz += content.len(); result.insert(fname, content); } } + self.aux_file_size_estimator.on_base_backup(sz); Ok(result) } @@ -1474,23 +1478,45 @@ impl<'a> DatadirModification<'a> { Err(PageReconstructError::MissingKey(_)) => None, Err(e) => return Err(e.into()), }; - let files = if let Some(ref old_val) = old_val { + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { aux_file::decode_file_value(old_val)? } else { Vec::new() }; - let new_files = if content.is_empty() { - files - .into_iter() - .filter(|(p, _)| &path != p) - .collect::>() - } else { - files - .into_iter() - .filter(|(p, _)| &path != p) - .chain(std::iter::once((path, content))) - .collect::>() - }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); + } else { + other_files.push(file); + } + } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); + } + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. + } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => anyhow::bail!("removing non-existing aux file: {}", path), + } let new_val = aux_file::encode_file_value(&new_files)?; self.put(key, Value::Image(new_val.into())); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d2fcd6c4a5..01f354b9e8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -61,9 +61,12 @@ use std::{ }; use crate::tenant::timeline::init::LocalLayerFileMetadata; -use crate::tenant::{ - layer_map::{LayerMap, SearchResult}, - metadata::TimelineMetadata, +use crate::{ + aux_file::AuxFileSizeEstimator, + tenant::{ + layer_map::{LayerMap, SearchResult}, + metadata::TimelineMetadata, + }, }; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -409,6 +412,8 @@ pub struct Timeline { /// Keep aux directory cache to avoid it's reconstruction on each update pub(crate) aux_files: tokio::sync::Mutex, + + pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, } pub struct WalReceiverInfo { @@ -2161,6 +2166,16 @@ impl Timeline { }; Arc::new_cyclic(|myself| { + let metrics = TimelineMetrics::new( + &tenant_shard_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + evictions_low_residence_duration_metric_threshold, + ), + ); + let aux_file_metrics = metrics.aux_file_size_gauge.clone(); + let mut result = Timeline { conf, tenant_conf, @@ -2192,14 +2207,7 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new( - &tenant_shard_id, - &timeline_id, - crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( - "mtime", - evictions_low_residence_duration_metric_threshold, - ), - ), + metrics, query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, @@ -2263,6 +2271,8 @@ impl Timeline { dir: None, n_deltas: 0, }), + + aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7d34e12ca3..8fa67e75c9 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -149,6 +149,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", + "pageserver_aux_file_estimated_size", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken From be0c73f8e7543ca4de8c89b816eb70f792f2685a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 May 2024 17:59:59 +0100 Subject: [PATCH 040/130] pageserver: improve API for invoking GC (#7655) ## Problem In https://github.com/neondatabase/neon/pull/7531, I had a test flaky because the GC API endpoint fails if the tenant happens not to be active yet. ## Summary of changes While adding that wait for the tenant to be active, I noticed that this endpoint is kind of strange (spawns a TaskManager task) and has a comment `// TODO: spawning is redundant now, need to hold the gate`, so this PR cleans it up to just run the GC inline while holding a gate. The GC code is updated to avoid assuming it runs inside a task manager task. Avoiding checking the task_mgr cancellation token is safe, because our timeline shutdown always cancels Timeline::cancel. --- pageserver/src/http/routes.rs | 7 +- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/mgr.rs | 107 +++++++++++++----------------- pageserver/src/tenant/timeline.rs | 2 - 4 files changed, 49 insertions(+), 69 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a8ca642dc5..2370561756 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1715,12 +1715,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; - let gc_result = wait_task_done - .await - .context("wait for gc task") - .map_err(ApiError::InternalServerError)? - .map_err(ApiError::InternalServerError)?; + let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; json_response(StatusCode::OK, gc_result) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 010e56a899..80d354d79e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2800,7 +2800,7 @@ impl Tenant { // See comments in [`Tenant::branch_timeline`] for more information about why branch // creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if task_mgr::is_shutdown_requested() || cancel.is_cancelled() { + if cancel.is_cancelled() { // We were requested to shut down. Stop and return with the progress we // made. break; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 6be66e99ad..7a3e36bf02 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2880,86 +2880,73 @@ use { utils::http::error::ApiError, }; -pub(crate) fn immediate_gc( +#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] +pub(crate) async fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, -) -> Result>, ApiError> { - let guard = TENANTS.read().unwrap(); - - let tenant = guard - .get(&tenant_shard_id) - .cloned() - .with_context(|| format!("tenant {tenant_shard_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; +) -> Result { + let tenant = { + let guard = TENANTS.read().unwrap(); + guard + .get(&tenant_shard_id) + .cloned() + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))? + }; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // Run in task_mgr to avoid race with tenant_detach operation - let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); - let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + let ctx: RequestContext = + ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - // TODO: spawning is redundant now, need to hold the gate - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::GarbageCollector, - Some(tenant_shard_id), - Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), - false, - async move { - fail::fail_point!("immediate_gc_task_pre"); + let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; - #[allow(unused_mut)] - let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. + fail::fail_point!("immediate_gc_task_pre"); - #[cfg(feature = "testing")] - { - // we need to synchronize with drop completion for python tests without polling for - // log messages - if let Ok(result) = result.as_mut() { - let mut js = tokio::task::JoinSet::new(); - for layer in std::mem::take(&mut result.doomed_layers) { - js.spawn(layer.wait_drop()); - } - tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped"); - while let Some(res) = js.join_next().await { - res.expect("wait_drop should not panic"); - } - } + #[allow(unused_mut)] + let mut result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. - let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); - - if let Some(rtc) = rtc { - // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care about the shutdown error, just exit fast - drop(rtc.wait_completion().await); - } + #[cfg(feature = "testing")] + { + // we need to synchronize with drop completion for python tests without polling for + // log messages + if let Ok(result) = result.as_mut() { + let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); } - - match task_done.send(result) { - Ok(_) => (), - Err(result) => error!("failed to send gc result: {result:?}"), + tracing::info!( + total = js.len(), + "starting to wait for the gc'd layers to be dropped" + ); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); } - Ok(()) } - .instrument(span) - ); - // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task - drop(guard); + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); - Ok(wait_task_done) + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't care about the shutdown error, just exit fast + drop(rtc.wait_completion().await); + } + } + + result.map_err(ApiError::InternalServerError) } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 01f354b9e8..9ee24a4ff0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4656,11 +4656,9 @@ impl Timeline { pub(super) async fn gc(&self) -> anyhow::Result { // this is most likely the background tasks, but it might be the spawned task from // immediate_gc - let cancel = crate::task_mgr::shutdown_token(); let _g = tokio::select! { guard = self.gc_lock.lock() => guard, _ = self.cancel.cancelled() => return Ok(GcResult::default()), - _ = cancel.cancelled() => return Ok(GcResult::default()), }; let timer = self.metrics.garbage_collect_histo.start_timer(); From 1412e9b3e827435bab6de0eeea0a3f9d721273c0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 13 May 2024 18:24:12 +0100 Subject: [PATCH 041/130] pagectl: fix diagrams generation for paths containing generations (#7739) ## Problem When layer paths include generations, the lsn parsing does not work and `pagectl` errors out. ## Summary of changes If the last "word" of the layer path contains 8 characters, discard it for the purpose of lsn parsing. --- pageserver/ctl/src/draw_timeline_dir.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index d8082f8ab4..4dff8af1fc 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -83,6 +83,11 @@ fn parse_filename(name: &str) -> (Range, Range) { let split: Vec<&str> = name.split("__").collect(); let keys: Vec<&str> = split[0].split('-').collect(); let mut lsns: Vec<&str> = split[1].split('-').collect(); + + if lsns.last().expect("should").len() == 8 { + lsns.pop(); + } + if lsns.len() == 1 { lsns.push(lsns[0]); } From 972470b1745146ad28c833385d34f8d413e34d1d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 May 2024 18:38:30 +0100 Subject: [PATCH 042/130] pageserver: use adaptive concurrency in secondary layer downloads (#7675) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Secondary downloads are a low priority task, and intentionally do not try to max out download speeds. This is almost always fine when they are used through the life of a tenant shard as a continuous "trickle" of background downloads. However, there are sometimes circumstances where we would like to populate a secondary location as fast as we can, within the constraint that we don't want to impact the activity of attached tenants: - During node removal, where we will need to create replacements for secondary locations on the node being removed - After a shard split, we need new secondary locations for the new shards to populate before the shards can be migrated to their final location. ## Summary of changes - Add an activity() function to the remote storage interface, enabling callers to query how busy the remote storage backend is - In the secondary download code, use a very modest amount of concurrency, driven by the remote storage's state: we only use concurrency if the remote storage semaphore is 75% free, and scale the amount of concurrency used within that range. This is not a super clever form of prioritization, but it should accomplish the key goals: - Enable secondary downloads to happen faster when the system is idle - Make secondary downloads a much lower priority than attached tenants when the remote storage is busy. --------- Co-authored-by: Arpad Müller --- libs/remote_storage/src/azure_blob.rs | 5 + libs/remote_storage/src/lib.rs | 34 +++ libs/remote_storage/src/local_fs.rs | 14 +- libs/remote_storage/src/s3_bucket.rs | 8 +- libs/remote_storage/src/simulate_failures.rs | 6 +- pageserver/src/tenant/secondary/downloader.rs | 238 +++++++++++++----- 6 files changed, 238 insertions(+), 67 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 24c1248304..220d4ef115 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -29,6 +29,7 @@ use http_types::{StatusCode, Url}; use tokio_util::sync::CancellationToken; use tracing::debug; +use crate::RemoteStorageActivity; use crate::{ error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, @@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage { // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview Err(TimeTravelError::Unimplemented) } + + fn activity(&self) -> RemoteStorageActivity { + self.concurrency_limiter.activity() + } } pin_project_lite::pin_project! { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 708662f20f..f024021507 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static { done_if_after: SystemTime, cancel: &CancellationToken, ) -> Result<(), TimeTravelError>; + + /// Query how busy we currently are: may be used by callers which wish to politely + /// back off if there are already a lot of operations underway. + fn activity(&self) -> RemoteStorageActivity; +} + +pub struct RemoteStorageActivity { + pub read_available: usize, + pub read_total: usize, + pub write_available: usize, + pub write_total: usize, } /// DownloadStream is sensitive to the timeout and cancellation used with the original @@ -444,6 +455,15 @@ impl GenericRemoteStorage> { } } } + + pub fn activity(&self) -> RemoteStorageActivity { + match self { + Self::LocalFs(s) => s.activity(), + Self::AwsS3(s) => s.activity(), + Self::AzureBlob(s) => s.activity(), + Self::Unreliable(s) => s.activity(), + } + } } impl GenericRemoteStorage { @@ -774,6 +794,9 @@ struct ConcurrencyLimiter { // The helps to ensure we don't exceed the thresholds. write: Arc, read: Arc, + + write_total: usize, + read_total: usize, } impl ConcurrencyLimiter { @@ -802,10 +825,21 @@ impl ConcurrencyLimiter { Arc::clone(self.for_kind(kind)).acquire_owned().await } + fn activity(&self) -> RemoteStorageActivity { + RemoteStorageActivity { + read_available: self.read.available_permits(), + read_total: self.read_total, + write_available: self.write.available_permits(), + write_total: self.write_total, + } + } + fn new(limit: usize) -> ConcurrencyLimiter { Self { read: Arc::new(Semaphore::new(limit)), write: Arc::new(Semaphore::new(limit)), + read_total: limit, + write_total: limit, } } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1f7bcfc982..f12f6590a3 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity, + TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs { ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } + + fn activity(&self) -> RemoteStorageActivity { + // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available + RemoteStorageActivity { + read_available: 16, + read_total: 16, + write_available: 16, + write_total: 16, + } + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c3d6c75e20..0f6772b274 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -47,8 +47,8 @@ use utils::backoff; use super::StorageMetadata; use crate::{ error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, - MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config, + TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -975,6 +975,10 @@ impl RemoteStorage for S3Bucket { } Ok(()) } + + fn activity(&self) -> RemoteStorageActivity { + self.concurrency_limiter.activity() + } } /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c467a2d196..66522e04ca 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken; use crate::{ Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - StorageMetadata, TimeTravelError, + RemoteStorageActivity, StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper { .time_travel_recover(prefix, timestamp, done_if_after, cancel) .await } + + fn activity(&self) -> RemoteStorageActivity { + self.inner.activity() + } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 2a8f83be95..c28e041fa2 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -45,10 +45,10 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; -use futures::Future; +use futures::{Future, StreamExt}; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -71,6 +71,12 @@ use super::{ /// `` const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); +/// Range of concurrency we may use when downloading layers within a timeline. This is independent +/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in +/// `PageServerConf::secondary_download_concurrency` +const MAX_LAYER_CONCURRENCY: usize = 16; +const MIN_LAYER_CONCURRENCY: usize = 1; + pub(super) async fn downloader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -79,14 +85,15 @@ pub(super) async fn downloader_task( cancel: CancellationToken, root_ctx: RequestContext, ) { - let concurrency = tenant_manager.get_conf().secondary_download_concurrency; + // How many tenants' secondary download operations we will run concurrently + let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, root_ctx, }; - let mut scheduler = Scheduler::new(generator, concurrency); + let mut scheduler = Scheduler::new(generator, tenant_concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) @@ -792,6 +799,8 @@ impl<'a> TenantDownloader<'a> { tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + let mut download_futs = Vec::new(); + // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. for layer in timeline.layers { @@ -874,67 +883,33 @@ impl<'a> TenantDownloader<'a> { } } - // Failpoint for simulating slow remote storage - failpoint_support::sleep_millis_async!( - "secondary-layer-download-sleep", - &self.secondary_state.cancel - ); - - // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally - let downloaded_bytes = match download_layer_file( - self.conf, - self.remote_storage, - *tenant_shard_id, - timeline.timeline_id, - &layer.name, - &LayerFileMetadata::from(&layer.metadata), - &self.secondary_state.cancel, + download_futs.push(self.download_layer( + tenant_shard_id, + &timeline.timeline_id, + layer, ctx, - ) - .await - { - Ok(bytes) => bytes, - Err(DownloadError::NotFound) => { - // A heatmap might be out of date and refer to a layer that doesn't exist any more. - // This is harmless: continue to download the next layer. It is expected during compaction - // GC. - tracing::debug!( - "Skipped downloading missing layer {}, raced with compaction/gc?", - layer.name - ); - continue; + )); + } + + // Break up layer downloads into chunks, so that for each chunk we can re-check how much + // concurrency to use based on activity level of remote storage. + while !download_futs.is_empty() { + let chunk = + download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY)); + + let concurrency = Self::layer_concurrency(self.remote_storage.activity()); + + let mut result_stream = futures::stream::iter(chunk).buffered(concurrency); + let mut result_stream = std::pin::pin!(result_stream); + while let Some(result) = result_stream.next().await { + match result { + Err(e) => return Err(e), + Ok(None) => { + // No error, but we didn't download the layer. Don't mark it touched + } + Ok(Some(layer)) => touched.push(layer), } - Err(e) => return Err(e.into()), - }; - - if downloaded_bytes != layer.metadata.file_size { - let local_path = local_layer_path( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - &layer.name, - &layer.metadata.generation, - ); - - tracing::warn!( - "Downloaded layer {} with unexpected size {} != {}. Removing download.", - layer.name, - downloaded_bytes, - layer.metadata.file_size - ); - - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found)?; - } else { - tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); - let mut progress = self.secondary_state.progress.lock().unwrap(); - progress.bytes_downloaded += downloaded_bytes; - progress.layers_downloaded += 1; } - - SECONDARY_MODE.download_layer.inc(); - touched.push(layer) } // Write updates to state to record layers we just downloaded or touched. @@ -966,6 +941,90 @@ impl<'a> TenantDownloader<'a> { Ok(()) } + + async fn download_layer( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer: HeatMapLayer, + ctx: &RequestContext, + ) -> Result, UpdateError> { + // Failpoint for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + let downloaded_bytes = match download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + *timeline_id, + &layer.name, + &LayerFileMetadata::from(&layer.metadata), + &self.secondary_state.cancel, + ctx, + ) + .await + { + Ok(bytes) => bytes, + Err(DownloadError::NotFound) => { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + return Ok(None); + } + Err(e) => return Err(e.into()), + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; + } + + SECONDARY_MODE.download_layer.inc(); + + Ok(Some(layer)) + } + + /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage + fn layer_concurrency(activity: RemoteStorageActivity) -> usize { + // When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping + // of our concurrency range to the units available within the remaining 25%. + let clamp_at = (activity.read_total * 3) / 4; + if activity.read_available > clamp_at { + (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at)) + / (activity.read_total - clamp_at) + } else { + MIN_LAYER_CONCURRENCY + } + } } /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline @@ -1092,3 +1151,58 @@ async fn init_timeline_state( detail } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn layer_concurrency() { + // Totally idle + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 16, + read_total: 16, + write_available: 16, + write_total: 16 + }), + MAX_LAYER_CONCURRENCY + ); + + // Totally busy + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 0, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MIN_LAYER_CONCURRENCY + ); + + // Edge of the range at which we interpolate + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 12, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MIN_LAYER_CONCURRENCY + ); + + // Midpoint of the range in which we interpolate + assert_eq!( + TenantDownloader::layer_concurrency(RemoteStorageActivity { + read_available: 14, + read_total: 16, + + write_available: 16, + write_total: 16 + }), + MAX_LAYER_CONCURRENCY / 2 + ); + } +} From 9ffb8523597d846b6afce7eb855b820b79efcbd6 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 13 May 2024 17:14:08 -0400 Subject: [PATCH 043/130] fix(test): ensure compatibility test uses the correct compute node (#7741) Use the old compute node for compat tests. --------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c2b70202e..8432655370 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -702,8 +702,7 @@ class NeonEnvBuilder: config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] # Update the config with new neon + postgres path in case of compat test - # FIXME: overriding pg_distrib_dir cause storage controller fail to start - # config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["pg_distrib_dir"] = str(self.pg_distrib_dir) config["neon_distrib_dir"] = str(self.neon_binpath) with (self.repo_dir / "config").open("w") as f: From 3a6fa768286fced1787b2320d4878e1b961b5a0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 14 May 2024 01:13:25 +0200 Subject: [PATCH 044/130] Tiered compaction: cut deltas along lsn as well if needed (#7671) In general, tiered compaction is splitting delta layers along the key dimension, but this can only continue until a single key is reached: if the changes from a single key don't fit into one layer file, we used to create layer files of unbounded sizes. This patch implements the method listed as TODO/FIXME in the source code. It does the following things: * Make `accum_key_values` take the target size and if one key's modifications exceed it, make it fill `partition_lsns`, a vector of lsns to use for partitioning. * Have `retile_deltas` use that `partition_lsns` to create delta layers separated by lsn. * Adjust the `test_many_updates_for_single_key` to allow layer files below 0.5 the target size. This situation can create arbitarily small layer files: The amount of data is arbitrary that sits between having just cut a new delta, and then stumbling upon the key that needs to be split along lsn. This data will end up in a dedicated layer and it can be arbitrarily small. * Ignore single-key delta layers for depth calculation: in theory we might have only single-key delta layers in a tier, and this might confuse depth calculation as well, but this should be unlikely. Fixes #7243 Part of #7554 --------- Co-authored-by: Heikki Linnakangas --- pageserver/compaction/src/compact_tiered.rs | 107 ++++++++++++++----- pageserver/compaction/src/helpers.rs | 23 +++- pageserver/compaction/src/identify_levels.rs | 6 ++ pageserver/compaction/tests/tests.rs | 8 +- 4 files changed, 109 insertions(+), 35 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 20e9cf2196..a8f184af24 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -530,8 +530,6 @@ where // If we have accumulated only a narrow band of keyspace, create an // image layer. Otherwise write a delta layer. - // FIXME: deal with the case of lots of values for same key - // FIXME: we are ignoring images here. Did we already divide the work // so that we won't encounter them here? @@ -550,39 +548,94 @@ where let mut new_jobs = Vec::new(); // Slide a window through the keyspace - let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream)); + let mut key_accum = + std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size)); let mut all_in_window: bool = false; let mut window = Window::new(); + + // Helper function to create a job for a new delta layer with given key-lsn + // rectangle. + let create_delta_job = |key_range, lsn_range: &Range, new_jobs: &mut Vec<_>| { + // The inputs for the job are all the input layers of the original job that + // overlap with the rectangle. + let batch_layers: Vec = job + .input_layers + .iter() + .filter(|layer_id| { + overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) + }) + .cloned() + .collect(); + assert!(!batch_layers.is_empty()); + new_jobs.push(CompactionJob { + key_range, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::CreateDelta, + input_layers: batch_layers, + completed: false, + }); + }; + loop { - if all_in_window && window.elems.is_empty() { + if all_in_window && window.is_empty() { // All done! break; } + + // If we now have enough keyspace for next delta layer in the window, create a + // new delta layer if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) { - let batch_layers: Vec = job - .input_layers - .iter() - .filter(|layer_id| { - overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) - }) - .cloned() - .collect(); - assert!(!batch_layers.is_empty()); - new_jobs.push(CompactionJob { - key_range, - lsn_range: job.lsn_range.clone(), - strategy: CompactionStrategy::CreateDelta, - input_layers: batch_layers, - completed: false, - }); - } else { - assert!(!all_in_window); - if let Some(next_key) = key_accum.next().await.transpose()? { - window.feed(next_key.key, next_key.size); - } else { + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + continue; + } + assert!(!all_in_window); + + // Process next key in the key space + match key_accum.next().await.transpose()? { + None => { all_in_window = true; } + Some(next_key) if next_key.partition_lsns.is_empty() => { + // Normal case: extend the window by the key + window.feed(next_key.key, next_key.size); + } + Some(next_key) => { + // A key with too large size impact for a single delta layer. This + // case occurs if you make a huge number of updates for a single key. + // + // Drain the window with has_more = false to make a clean cut before + // the key, and then make dedicated delta layers for the single key. + // + // We cannot cluster the key with the others, because we don't want + // layer files to overlap with each other in the lsn,key space (no + // overlaps for the rectangles). + let key = next_key.key; + debug!("key {key} with size impact larger than the layer size"); + while !window.is_empty() { + let has_more = false; + let key_range = window.choose_next_delta(self.target_file_size, has_more) + .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window"); + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + } + + // Not really required: but here for future resilience: + // We make a "gap" here, so any structure the window holds should + // probably be reset. + window = Window::new(); + + let mut prior_lsn = job.lsn_range.start; + let mut lsn_ranges = Vec::new(); + for (lsn, _size) in next_key.partition_lsns.iter() { + lsn_ranges.push(prior_lsn..*lsn); + prior_lsn = *lsn; + } + lsn_ranges.push(prior_lsn..job.lsn_range.end); + for lsn_range in lsn_ranges { + let key_range = key..key.next(); + create_delta_job(key_range, &lsn_range, &mut new_jobs); + } + } } } @@ -803,6 +856,10 @@ where self.elems.front().unwrap().accum_size - self.splitoff_size } + fn is_empty(&self) -> bool { + self.elems.is_empty() + } + fn commit_upto(&mut self, mut upto: usize) { while upto > 1 { let popped = self.elems.pop_front().unwrap(); diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 06454ee1d0..2c922b0a49 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -235,9 +235,14 @@ pub struct KeySize { pub key: K, pub num_values: u64, pub size: u64, + /// The lsns to partition at (if empty then no per-lsn partitioning) + pub partition_lsns: Vec<(Lsn, u64)>, } -pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> +pub fn accum_key_values<'a, I, K, D, E>( + input: I, + target_size: u64, +) -> impl Stream, E>> where K: Eq + PartialOrd + Display + Copy, I: Stream>, @@ -249,25 +254,35 @@ where if let Some(first) = input.next().await { let first = first?; + let mut part_size = first.size(); let mut accum: KeySize = KeySize { key: first.key(), num_values: 1, - size: first.size(), + size: part_size, + partition_lsns: Vec::new(), }; let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { - accum.size += this.size(); + let add_size = this.size(); + if part_size + add_size > target_size { + accum.partition_lsns.push((this.lsn(), part_size)); + part_size = 0; + } + part_size += add_size; + accum.size += add_size; accum.num_values += 1; } else { assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); last_key = accum.key; yield accum; + part_size = this.size(); accum = KeySize { key: this.key(), num_values: 1, - size: this.size(), + size: part_size, + partition_lsns: Vec::new(), }; } } diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index 98dd46925c..1853afffdd 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -184,6 +184,12 @@ impl Level { } let mut events: Vec> = Vec::new(); for (idx, l) in self.layers.iter().enumerate() { + let key_range = l.key_range(); + if key_range.end == key_range.start.next() && l.is_delta() { + // Ignore single-key delta layers as they can be stacked on top of each other + // as that is the only way to cut further. + continue; + } events.push(Event { key: l.key_range().start, layer_idx: idx, diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index 7aa20e6863..bd8b54a286 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -20,10 +20,6 @@ pub(crate) fn setup_logging() { /// even if we produce an extremely narrow delta layer, spanning just that one /// key, we still too many records to fit in the target file size. We need to /// split in the LSN dimension too in that case. -/// -/// TODO: The code to avoid this problem has not been implemented yet! So the -/// assertion currently fails, but we need to make it not fail. -#[ignore] #[tokio::test] async fn test_many_updates_for_single_key() { setup_logging(); @@ -43,9 +39,9 @@ async fn test_many_updates_for_single_key() { } for l in executor.live_layers.iter() { assert!(l.file_size() < executor.target_file_size * 2); - // sanity check that none of the delta layers are stupidly small either + // Sanity check that none of the delta layers are empty either. if l.is_delta() { - assert!(l.file_size() > executor.target_file_size / 2); + assert!(l.file_size() > 0); } } } From ba20752b7678179f5db0248ce98f56d225684c77 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 May 2024 13:17:27 +0300 Subject: [PATCH 045/130] Refactor the request LSNs to a separate struct (#7708) We had a lot of code that passed around the two LSNs that are associated with each GetPage request. Introduce a new struct to encapsulate them. I'm about to add a third LSN to the struct in the next commit, this is a mechanical refactoring in preparation for that. --- pgxn/neon/pagestore_client.h | 24 ++++- pgxn/neon/pagestore_smgr.c | 179 ++++++++++++++------------------ pgxn/neon_test_utils/neontest.c | 24 ++--- 3 files changed, 111 insertions(+), 116 deletions(-) diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7709ab9d42..1334a04f9a 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -237,18 +237,38 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +/* + * LSN values associated with each request to the pageserver + */ +typedef struct +{ + /* + * 'request_lsn' is the main value that determines which page version to + * fetch. + */ + XLogRecPtr request_lsn; + + /* + * A hint to the pageserver that the requested page hasn't been modified + * between this LSN and 'request_lsn'. That allows the pageserver to + * return the page faster, without waiting for 'request_lsn' to arrive in + * the pageserver, as long as 'not_modified_since' has arrived. + */ + XLogRecPtr not_modified_since; +} neon_request_lsns; + #if PG_MAJORVERSION_NUM < 16 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); + neon_request_lsns request_lsns, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); + neon_request_lsns request_lsns, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 44ecdbd9aa..8fcbbfe54d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -168,8 +168,7 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -271,16 +270,15 @@ static PrefetchState *MyPState; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); +static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); +static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since); -static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, +static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); +static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, PrefetchRequest *slot); static bool @@ -338,8 +336,7 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->request_lsn = source_slot->request_lsn; - target_slot->not_modified_since = source_slot->not_modified_since; + target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,8 +355,9 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->request_lsn = InvalidXLogRecPtr; - source_slot->not_modified_since = InvalidXLogRecPtr; + source_slot->request_lsns = (neon_request_lsns) { + InvalidXLogRecPtr, InvalidXLogRecPtr + }; /* update bookkeeping */ n_moved++; @@ -689,7 +687,7 @@ prefetch_set_unused(uint64 ring_index) * prefetch_wait_for(). */ static void -prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) +prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; NeonGetPageRequest request = { @@ -700,23 +698,14 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe .blkno = slot->buftag.blockNum, }; - Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); - - if (force_request_lsn) - { - request.req.lsn = *force_request_lsn; - request.req.not_modified_since = *force_not_modified_since; - } + if (force_request_lsns) + slot->request_lsns = *force_request_lsns; else - { - neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum, - &request.req.lsn, - &request.req.not_modified_since); - } - slot->request_lsn = request.req.lsn; - slot->not_modified_since = request.req.not_modified_since; + slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum); + request.req.lsn = slot->request_lsns.request_lsn; + request.req.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -742,25 +731,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe * * Register that we may want the contents of BufferTag in the near future. * - * If force_request_lsn and force_not_modified_since are not NULL, those - * values are sent to the pageserver. If they are NULL, we utilize the - * lastWrittenLsn -infrastructure to fill them in. + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure + * to calculate the LSNs to send. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. */ static uint64 -prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, - XLogRecPtr *force_not_modified_since) +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; - Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); - /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -781,10 +767,9 @@ Retry: * If the caller specified a request LSN to use, only accept prefetch * responses that satisfy that request. */ - if (force_request_lsn) + if (force_request_lsns) { - if (!neon_prefetch_response_usable(*force_request_lsn, - *force_not_modified_since, slot)) + if (!neon_prefetch_response_usable(*force_request_lsns, slot)) { /* Wait for the old request to finish and discard it */ if (!prefetch_wait_for(ring_index)) @@ -886,7 +871,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_request_lsn, force_not_modified_since); + prefetch_do_request(slot, force_request_lsns); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -1529,11 +1514,11 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static void -neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) +static neon_request_lsns +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr last_written_lsn; + neon_request_lsns result; last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); last_written_lsn = nm_adjust_lsn(last_written_lsn); @@ -1542,12 +1527,12 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, if (RecoveryInProgress()) { /* Request the page at the last replayed LSN. */ - *request_lsn = GetXLogReplayRecPtr(NULL); - *not_modified_since = last_written_lsn; - Assert(last_written_lsn <= *request_lsn); + result.request_lsn = GetXLogReplayRecPtr(NULL); + result.not_modified_since = last_written_lsn; + Assert(last_written_lsn <= result.request_lsn); - neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); } else { @@ -1559,7 +1544,7 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * must still in the buffer cache, so our request cannot concern * those. */ - neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", LSN_FORMAT_ARGS(last_written_lsn)); /* @@ -1592,9 +1577,11 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * flush, it should still be in the buffer cache, and we wouldn't be * requesting it. */ - *request_lsn = flushlsn; - *not_modified_since = last_written_lsn; + result.request_lsn = flushlsn; + result.not_modified_since = last_written_lsn; } + + return result; } /* @@ -1604,12 +1591,12 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * satisfy a page read now. */ static bool -neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, +neon_prefetch_response_usable(neon_request_lsns request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsn >= not_modified_since); - Assert(slot->request_lsn >= slot->not_modified_since); + Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->status != PRFS_UNUSED); /* @@ -1627,14 +1614,15 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + if (request_lsns.request_lsn < slot->request_lsns.request_lsn || + request_lsns.not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + LSN_FORMAT_ARGS(request_lsns.request_lsn), LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; } @@ -1675,9 +1663,9 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si */ /* this follows from the checks above */ - Assert(request_lsn >= slot->not_modified_since); + Assert(request_lsns.request_lsn >= slot->request_lsns.not_modified_since); - return not_modified_since <= slot->request_lsn; + return request_lsns.not_modified_since <= slot->request_lsns.request_lsn; } /* @@ -1689,8 +1677,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -1745,15 +1732,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum}; + .forknum = forkNum + }; resp = page_server_request(&request); } @@ -1770,7 +1757,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2135,7 +2122,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_buffer(tag, NULL, NULL); + ring_index = prefetch_register_buffer(tag, NULL); Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); @@ -2188,10 +2175,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) + neon_request_lsns request_lsns, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) + neon_request_lsns request_lsns, void *buffer) #endif { NeonResponse *resp; @@ -2223,7 +2210,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * value of the LwLsn cache when the entry is not found. */ if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsn); + XLogWaitForReplayOf(request_lsns.request_lsn); /* * Try to find prefetched page in the list of received pages. @@ -2234,7 +2221,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { slot = entry->slot; - if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) + if (neon_prefetch_response_usable(request_lsns, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; @@ -2268,8 +2255,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsn, - ¬_modified_since); + ring_index = prefetch_register_buffer(buftag, &request_lsns); slot = GetPrfSlot(ring_index); } else @@ -2310,7 +2296,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2333,8 +2319,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2359,9 +2344,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, - &request_lsn, ¬_modified_since); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2530,8 +2514,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -2558,13 +2541,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2584,7 +2566,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2595,10 +2577,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - n_blocks); + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.request_lsn), + n_blocks); pfree(resp); return n_blocks; @@ -2612,17 +2594,15 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn, - not_modified_since; + neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, - &request_lsn, ¬_modified_since); + request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; @@ -2639,8 +2619,7 @@ neon_dbsize(Oid dbNode) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2650,9 +2629,7 @@ neon_dbsize(Oid dbNode) } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - db_size); + dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn), db_size); pfree(resp); return db_size; diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 677006923d..9f63b58a86 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); + neon_request_lsns request_lsns, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); + neon_request_lsns request_lsns, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -298,9 +298,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *relname; text *forkname; uint32 blkno; - - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; if (PG_NARGS() != 5) elog(ERROR, "unexpected number of arguments in SQL function signature"); @@ -312,8 +310,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); - request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); - not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4); if (!superuser()) ereport(ERROR, @@ -367,7 +365,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns, + raw_page_data); relation_close(rel, AccessShareLock); @@ -413,19 +412,18 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) ForkNumber forknum = PG_GETARG_UINT32(3); uint32 blkno = PG_GETARG_UINT32(4); - XLogRecPtr request_lsn; - XLogRecPtr not_modified_since; + neon_request_lsns request_lsns; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); - not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6); + request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6); SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } From 22afaea6e1c595282633b210ac02990f98378458 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 May 2024 13:17:30 +0300 Subject: [PATCH 046/130] Always use Lsn::MAX as the request LSN in the primary (#7708) The new protocol version supports sending two LSNs to the pageserver: request LSN and a "not_modified_since" hint. A primary always wants to read the latest version of each page, so having two values was not strictly necessary, and the old protocol worked fine with just the "not_modified_since" LSN and a flag to request the latest page version. Nevertheless, it seemed like a good idea to set the request LSN to the current insert/flush LSN, because that's logically the page version that the primary wants to read. However, that made the test_gc_aggressive test case flaky. When the primary requests a page with the last inserted or flushed LSN, it's possible that by the time that the pageserver processes the request, more WAL has been generated by other processes in the compute and already digested by the pageserver. Furthermore, if the PITR horizon in the pageserver is set to 0, and GC runs during that window, it's possible that the GC horizon has advances past the request LSN, before the pageserver processes the request. It is still correct to send the latest page version in that case, because the compute either has the page locked so the it cannot have been modified in the primary, or if it's a prefetch request, and we will validate the LSNs when the prefetch response is processed and discard it if the page has been modified. But the pageserver doesn't know that and rightly complains. To fix, modify the compute so that the primary always uses Lsn::MAX in the requests. This reverts the primary's behavior to how the protocol version 1 worked. In protocol version 1, there was only one LSN, the "not_modified_since" hint, and a flag was set to read the latest page version, whatever that might be. Requests from computes that are still using protocol version 1 were already mapped to Lsn::MAX in the pageserver, now we do the same with protocol version 2 for primary's requests. (I'm a bit sad about losing the information in the pageserver, what the last LSN was at the time that the request wa made. We never had it with protocol version 1, but I wanted to make it available for debugging purposes.) Add another field, 'effective_request_lsn', to track what the flush LSN was when the request was made. It's not sent to the pageserver, Lsn::MAX is now used as the request LSN, but it's still needed internally in the compute to track the validity of prefetch requests. Fixes issue https://github.com/neondatabase/neon/issues/7692 --- pgxn/neon/pagestore_client.h | 12 +++++ pgxn/neon/pagestore_smgr.c | 95 +++++++++++++++++++++++---------- pgxn/neon_test_utils/neontest.c | 14 +++++ 3 files changed, 92 insertions(+), 29 deletions(-) diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 1334a04f9a..8951e6607b 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -255,6 +255,18 @@ typedef struct * the pageserver, as long as 'not_modified_since' has arrived. */ XLogRecPtr not_modified_since; + + /* + * 'effective_request_lsn' is not included in the request that's sent to + * the pageserver, but is used to keep track of the latest LSN of when the + * request was made. In a standby server, this is always the same as the + * 'request_lsn', but in the primary we use UINT64_MAX as the + * 'request_lsn' to request the latest page version, so we need this + * separate field to remember that latest LSN was when the request was + * made. It's needed to manage prefetch request, to verify if the response + * to a prefetched request is still valid. + */ + XLogRecPtr effective_request_lsn; } neon_request_lsns; #if PG_MAJORVERSION_NUM < 16 diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8fcbbfe54d..e3b841f526 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -356,7 +356,7 @@ compact_prefetch_buffers(void) source_slot->response = NULL; source_slot->my_ring_index = 0; source_slot->request_lsns = (neon_request_lsns) { - InvalidXLogRecPtr, InvalidXLogRecPtr + InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr }; /* update bookkeeping */ @@ -1529,6 +1529,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) /* Request the page at the last replayed LSN. */ result.request_lsn = GetXLogReplayRecPtr(NULL); result.not_modified_since = last_written_lsn; + result.effective_request_lsn = result.request_lsn; Assert(last_written_lsn <= result.request_lsn); neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", @@ -1570,15 +1571,30 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) } /* - * Request the latest version of the page. The most up-to-date request - * LSN we could use would be the current insert LSN, but to avoid the - * overhead of looking it up, use 'flushlsn' instead. This relies on - * the assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we wouldn't be - * requesting it. + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. */ - result.request_lsn = flushlsn; + result.request_lsn = UINT64_MAX; result.not_modified_since = last_written_lsn; + result.effective_request_lsn = flushlsn; } return result; @@ -1596,7 +1612,11 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, { /* sanity check the LSN's on the old and the new request */ Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); Assert(slot->status != PRFS_UNUSED); /* @@ -1614,27 +1634,40 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsns.request_lsn < slot->request_lsns.request_lsn || + if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || request_lsns.not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns.request_lsn), LSN_FORMAT_ARGS(request_lsns.not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; } /*--- - * Each request to the pageserver carries two LSN values: - * `not_modified_since` and `request_lsn`. The (not_modified_since, - * request_lsn] range of each request is effectively a claim that the page - * has not been modified between those LSNs. If the range of the old - * request in the queue overlaps with the new request, we know that the - * page hasn't been modified in the union of the ranges. We can use the - * response to old request to satisfy the new request in that case. For - * example: + * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the last flush WAL position when the request + * was sent to the pageserver. That's logically the LSN that we are + * requesting the page at, but we send UINT64_MAX to the pageserver so + * that if the GC horizon advances past that position, we still get a + * valid response instead of an error. + * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request, we know that the page hasn't been modified in the union of + * the ranges. We can use the response to old request to satisfy the new + * request in that case. For example: * * 100 500 * Old request: +--------+ @@ -1663,9 +1696,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, */ /* this follows from the checks above */ - Assert(request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); - return request_lsns.not_modified_since <= slot->request_lsns.request_lsn; + return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1757,7 +1790,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, - LSN_FORMAT_ARGS(request_lsns.request_lsn)), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2296,7 +2329,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, - LSN_FORMAT_ARGS(request_lsns.request_lsn)), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2566,7 +2599,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - LSN_FORMAT_ARGS(request_lsns.request_lsn)), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2579,7 +2612,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - LSN_FORMAT_ARGS(request_lsns.request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), n_blocks); pfree(resp); @@ -2619,7 +2652,7 @@ neon_dbsize(Oid dbNode) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn)), + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; @@ -2629,7 +2662,7 @@ neon_dbsize(Oid dbNode) } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.request_lsn), db_size); + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); pfree(resp); return db_size; @@ -2874,6 +2907,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf XLogRecPtr request_lsn, not_modified_since; + /* + * Compute a request LSN to use, similar to neon_get_request_lsns() but the + * logic is a bit simpler. + */ if (RecoveryInProgress()) { request_lsn = GetXLogReplayRecPtr(NULL); @@ -2885,10 +2922,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ request_lsn = GetRedoStartLsn(); } + request_lsn = nm_adjust_lsn(request_lsn); } else - request_lsn = GetXLogInsertRecPtr(); - request_lsn = nm_adjust_lsn(request_lsn); + request_lsn = UINT64_MAX; /* * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 9f63b58a86..47f245fbf1 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -312,6 +312,13 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4); + /* + * For the time being, use the same LSN for request and + * effective request LSN. If any test needed to use UINT64_MAX + * as the request LSN, we'd need to add effective_request_lsn + * as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; if (!superuser()) ereport(ERROR, @@ -419,6 +426,13 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6); + /* + * For the time being, use the same LSN for request + * and effective request LSN. If any test needed to + * use UINT64_MAX as the request LSN, we'd need to add + * effective_request_lsn as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); From cd0e34493887c91485184f3fa5515ea94438aeda Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 May 2024 09:31:26 +0100 Subject: [PATCH 047/130] pageserver: do fewer heatmap uploads for tiny tenants (#7731) ## Problem Currently we do a large number of heatmap uploads for tiny tenants. "tiny" in this context is defined as being less than a single layer in size. These uploads are triggered by atime changes rather than changes in the set of layers. Uploading heatmaps for atime changes on small tenants isn't useful, because even without bumping these atimes, disk usage eviction still avoids evicting the largest resident layer of a tenant, which in practice keeps tiny/empty tenants mostly resident irrespective of atimes. ## Summary of changes - For tenants smaller than one checkpoint interval, only upload heatmap if the set of layers has changed, not if only the atimes have changed. - Include the heatmap period in the uploaded heatmap, as a precursor to implementing https://github.com/neondatabase/neon/issues/6200 (auto-adjusting download intervals to match upload intervals) --- pageserver/src/tenant/secondary/heatmap.rs | 25 ++++++ .../src/tenant/secondary/heatmap_uploader.rs | 82 +++++++++++++------ 2 files changed, 81 insertions(+), 26 deletions(-) diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index ca91ec24c6..2da4a3b9d5 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -15,6 +15,14 @@ pub(super) struct HeatMapTenant { pub(super) generation: Generation, pub(super) timelines: Vec, + + /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders + /// of how frequently it is worthwhile to check for updates. + /// + /// This is optional for backward compat, and because we sometimes might upload + /// a heatmap explicitly via API for a tenant that has no periodic upload configured. + #[serde(default)] + pub(super) upload_period_ms: Option, } #[serde_as] @@ -81,4 +89,21 @@ impl HeatMapTenant { stats } + + pub(crate) fn strip_atimes(self) -> Self { + Self { + timelines: self + .timelines + .into_iter() + .map(|mut tl| { + for layer in &mut tl.layers { + layer.access_time = SystemTime::UNIX_EPOCH; + } + tl + }) + .collect(), + generation: self.generation, + upload_period_ms: self.upload_period_ms, + } + } } diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 352409f5fc..fddced3ead 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress { struct UploadPending { tenant: Arc, - last_digest: Option, + last_upload: Option, target_time: Option, period: Option, } @@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending { struct WriteComplete { tenant_shard_id: TenantShardId, completed_at: Instant, - digest: Option, + uploaded: Option, next_upload: Option, } @@ -115,10 +115,7 @@ struct UploaderTenantState { tenant: Weak, /// Digest of the serialized heatmap that we last successfully uploaded - /// - /// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, - /// which is also an md5sum. - last_digest: Option, + last_upload_state: Option, /// When the last upload attempt completed (may have been successful or failed) last_upload: Option, @@ -187,7 +184,7 @@ impl JobGenerator tenant: Arc::downgrade(&tenant), last_upload: None, next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), - last_digest: None, + last_upload_state: None, }); // Decline to do the upload if insufficient time has passed @@ -195,10 +192,10 @@ impl JobGenerator return; } - let last_digest = state.last_digest; + let last_upload = state.last_upload_state.clone(); result.jobs.push(UploadPending { tenant, - last_digest, + last_upload, target_time: state.next_upload, period: Some(period), }); @@ -218,7 +215,7 @@ impl JobGenerator ) { let UploadPending { tenant, - last_digest, + last_upload, target_time, period, } = job; @@ -231,16 +228,16 @@ impl JobGenerator let _completion = completion; let started_at = Instant::now(); - let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await { - Ok(UploadHeatmapOutcome::Uploaded(digest)) => { + let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await { + Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => { let duration = Instant::now().duration_since(started_at); SECONDARY_MODE .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap.inc(); - Some(digest) + Some(uploaded) } - Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest, + Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload, Err(UploadHeatmapError::Upload(e)) => { tracing::warn!( "Failed to upload heatmap for tenant {}: {e:#}", @@ -251,11 +248,11 @@ impl JobGenerator .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap_errors.inc(); - last_digest + last_upload } Err(UploadHeatmapError::Cancelled) => { tracing::info!("Cancelled heatmap upload, shutting down"); - last_digest + last_upload } }; @@ -277,7 +274,7 @@ impl JobGenerator WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, - digest, + uploaded, next_upload, } }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) @@ -299,7 +296,7 @@ impl JobGenerator Ok(UploadPending { // Ignore our state for last digest: this forces an upload even if nothing has changed - last_digest: None, + last_upload: None, tenant, target_time: None, period: None, @@ -312,7 +309,7 @@ impl JobGenerator let WriteComplete { tenant_shard_id, completed_at, - digest, + uploaded, next_upload, } = completion; use std::collections::hash_map::Entry; @@ -322,7 +319,7 @@ impl JobGenerator } Entry::Occupied(mut entry) => { entry.get_mut().last_upload = Some(completed_at); - entry.get_mut().last_digest = digest; + entry.get_mut().last_upload_state = uploaded; entry.get_mut().next_upload = next_upload } } @@ -331,7 +328,7 @@ impl JobGenerator enum UploadHeatmapOutcome { /// We successfully wrote to remote storage, with this digest. - Uploaded(md5::Digest), + Uploaded(LastUploadState), /// We did not upload because the heatmap digest was unchanged since the last upload NoChange, /// We skipped the upload for some reason, such as tenant/timeline not ready @@ -347,12 +344,25 @@ enum UploadHeatmapError { Upload(#[from] anyhow::Error), } +/// Digests describing the heatmap we most recently uploaded successfully. +/// +/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, +/// which is also an md5sum. +#[derive(Clone)] +struct LastUploadState { + // Digest of json-encoded HeatMapTenant + uploaded_digest: md5::Digest, + + // Digest without atimes set. + layers_only_digest: md5::Digest, +} + /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, - last_digest: Option, + last_upload: Option, ) -> Result { debug_assert_current_span_has_tenant_id(); @@ -368,6 +378,7 @@ async fn upload_tenant_heatmap( let mut heatmap = HeatMapTenant { timelines: Vec::new(), generation, + upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()), }; let timelines = tenant.timelines.lock().unwrap().clone(); @@ -396,15 +407,31 @@ async fn upload_tenant_heatmap( // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; - let bytes = bytes::Bytes::from(bytes); - let size = bytes.len(); // Drop out early if nothing changed since our last upload let digest = md5::compute(&bytes); - if Some(digest) == last_digest { + if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) { return Ok(UploadHeatmapOutcome::NoChange); } + // Calculate a digest that omits atimes, so that we can distinguish actual changes in + // layers from changes only in atimes. + let heatmap_size_bytes = heatmap.get_stats().bytes; + let layers_only_bytes = + serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?; + let layers_only_digest = md5::compute(&layers_only_bytes); + if heatmap_size_bytes < tenant.get_checkpoint_distance() { + // For small tenants, skip upload if only atimes changed. This avoids doing frequent + // uploads from long-idle tenants whose atimes are just incremented by periodic + // size calculations. + if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) { + return Ok(UploadHeatmapOutcome::NoChange); + } + } + + let bytes = bytes::Bytes::from(bytes); + let size = bytes.len(); + let path = remote_heatmap_path(tenant.get_tenant_shard_id()); let cancel = &tenant.cancel; @@ -436,5 +463,8 @@ async fn upload_tenant_heatmap( tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); - Ok(UploadHeatmapOutcome::Uploaded(digest)) + Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { + uploaded_digest: digest, + layers_only_digest, + })) } From df0f1e359b1ff01474ffaac26cec730697d1d643 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 May 2024 09:37:48 +0100 Subject: [PATCH 048/130] pageserver: switch on new-style local layer paths (#7660) We recently added support for local layer paths that contain a generation number: - https://github.com/neondatabase/neon/pull/7609 - https://github.com/neondatabase/neon/pull/7640 Now that we've cut a [release](https://github.com/neondatabase/neon/pull/7735) that includes those changes, we can proceed to enable writing the new format without breaking forward compatibility. --- pageserver/src/tenant/storage_layer/layer.rs | 17 +++++++---------- .../regress/test_pageserver_generations.py | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b5b0260327..b6f7702247 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -129,19 +129,16 @@ pub(crate) fn local_layer_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, layer_file_name: &LayerName, - _generation: &Generation, + generation: &Generation, ) -> Utf8PathBuf { let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); - timeline_path.join(layer_file_name.to_string()) - - // TODO: switch to enabling new-style layer paths after next release - // if generation.is_none() { - // // Without a generation, we may only use legacy path style - // timeline_path.join(layer_file_name.to_string()) - // } else { - // timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) - // } + if generation.is_none() { + // Without a generation, we may only use legacy path style + timeline_path.join(layer_file_name.to_string()) + } else { + timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + } } impl Layer { diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index a38bcd45da..4fdc5852f5 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -703,7 +703,6 @@ def test_multi_attach( workload.validate(pageservers[2].id) -@pytest.mark.skip(reason="To be enabled after release with new local path style") def test_upgrade_generationless_local_file_paths( neon_env_builder: NeonEnvBuilder, ): From b6ee91835b5eaf1d210805d0e4e07d01214445dd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 14 May 2024 11:39:59 +0100 Subject: [PATCH 049/130] CI(report-benchmarks-failures): fix condition (#7745) ## Problem `report-benchmarks-failures` job is triggered for any failure in the CI pipeline, but we need it to be triggered only for failed `benchmarks` job ## Summary of changes - replace `failure()` with `needs.benchmarks.result == 'failure'` in the condition --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f417cecd58..14d19f7ae3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -548,7 +548,7 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] - if: github.ref_name == 'main' && failure() + if: github.ref_name == 'main' && needs.benchmarks.result == 'failure' runs-on: ubuntu-latest steps: From 30d15ad4032e543be37fc0378345d9b6568b20cc Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 14 May 2024 10:36:48 -0400 Subject: [PATCH 050/130] chore(test): add version check for forward compat test (#7685) A test for https://github.com/neondatabase/neon/pull/7684. This pull request checks if the pageserver version we specified is the one actually running by comparing the git hash in forward compatibility tests. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_compatibility.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ef35bf4696..65649e0c0a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,4 +1,5 @@ import os +import re import shutil import subprocess import tempfile @@ -245,14 +246,34 @@ def test_forward_compatibility( compatibility_snapshot_dir / "repo", ) + # not using env.pageserver.version because it was initialized before + prev_pageserver_version_str = env.get_binary_version("pageserver") + prev_pageserver_version_match = re.search( + "Neon page server git-env:(.*) failpoints: (.*), features: (.*)", + prev_pageserver_version_str, + ) + if prev_pageserver_version_match is not None: + prev_pageserver_version = prev_pageserver_version_match.group(1) + else: + raise AssertionError( + "cannot find git hash in the version string: " + prev_pageserver_version_str + ) + + # does not include logs from previous runs + assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) + neon_env_builder.start() + # ensure the specified pageserver is running + assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) + check_neon_works( env, test_output_dir=test_output_dir, sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + except Exception: if breaking_changes_allowed: pytest.xfail( From 82960b2175211c0f666b91b5258c5e2253a245c7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 May 2024 16:39:17 +0100 Subject: [PATCH 051/130] pageserver: skip waiting for logical size on shard >0 (#7744) ## Problem Shards with number >0 could hang waiting for `await_initial_logical_size`, as we don't calculate logical size on these shards. This causes them to hold onto semaphore units and starve other tenants out from proceeding with warmup activation. That doesn't hurt availability (we still have on-demand activation), but it does mean that some background tasks like consumption metrics would omit some tenants. ## Summary of changes - Skip waiting for logical size calculation on shards >0 - Upgrade unexpected code paths to use debug_assert!(), which acts as an implicit regression test for this issue, and make the info() one into a warn() --- .../src/tenant/remote_timeline_client.rs | 5 +++++ pageserver/src/tenant/timeline.rs | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 9103760388..630ade5c13 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1127,6 +1127,11 @@ impl RemoteTimelineClient { Ok(()) } + pub(crate) fn is_deleting(&self) -> bool { + let mut locked = self.upload_queue.lock().unwrap(); + locked.stopped_mut().is_ok() + } + pub(crate) async fn preserve_initdb_archive( self: &Arc, tenant_id: &TenantId, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9ee24a4ff0..ca34b4fadc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2631,6 +2631,7 @@ impl Timeline { // Don't make noise. } else { warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); } } }; @@ -4355,6 +4356,21 @@ impl Timeline { /// this Timeline is shut down. Calling this function will cause the initial /// logical size calculation to skip waiting for the background jobs barrier. pub(crate) async fn await_initial_logical_size(self: Arc) { + if !self.shard_identity.is_shard_zero() { + // We don't populate logical size on shard >0: skip waiting for it. + return; + } + + if self + .remote_client + .as_ref() + .map(|c| c.is_deleting()) + .unwrap_or(false) + { + // The timeline was created in a deletion-resume state, we don't expect logical size to be populated + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -4366,9 +4382,10 @@ impl Timeline { // the logical size cancellation to skip the concurrency limit semaphore. // TODO: this is an unexpected case. We should restructure so that it // can't happen. - tracing::info!( + tracing::warn!( "await_initial_logical_size: can't get semaphore cancel token, skipping" ); + debug_assert!(false); } tokio::select!( From e67fcf9563faa1ebf0653a378a1999f446a3c1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 14 May 2024 17:49:19 +0200 Subject: [PATCH 052/130] Update mold to 2.31 (#7757) The [2.31.0 release](https://github.com/rui314/mold/releases/tag/v2.31.0) of mold includes a 10% speed improvement for binaries with a lot of debug info. As we have such, it might be useful to update mold to the latest release. The jump is from 2.4.0 to 2.31.0, but it's not been many releases in between as the version number was raised by the mold maintainers to 2.30.0 after 2.4.1 [to avoid confusion for some tools](https://github.com/rui314/mold/releases/tag/v2.30.0). --- Dockerfile.build-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 19739cc1f8..460b8c996d 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.4.0 +ENV MOLD_VERSION v2.31.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ From 4eedb3b6f188408cd74397f81e02fc4c416eb5f5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 14 May 2024 18:03:08 +0200 Subject: [PATCH 053/130] test suite: allow overriding default compaction algorithm via env var (#7747) This PR allows setting the `PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM` env var to override the `tenant_config.compaction_algorithm` field in the initial `pageserver.toml` for all tests. I tested manually that this works by halting a test using pdb and inspecting the `effective_config` in the tenant status managment API. If the env var is set, the tests are parametrized by the `kind` tag field, allowing to do a matrix build in CI and let Allure summarize everything in a nice report. If the env var is not set, the tests are not parametrized. So, merging this PR doesn't cause problems for flaky test detection. In fact, it doesn't cause any runtime change if the env var is not set. There are some tests in the test suite that set used to override the entire tenant_config using `NeonEnvBuilder.pageserver_config_override`. Since config overrides are merged non-recursively, such overrides that don't specify `kind = ` cause a fallback to pageserver's built-in `DEFAULT_COMPACTION_ALGORITHM`. Such cases can be found using ``` ["']tenant_config\s*[='"] ``` We'll deal with these tests in a future PR. closes https://github.com/neondatabase/neon/issues/7555 --- scripts/flaky_tests.py | 25 ++++++++++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 18 ++++++++++++++++++ test_runner/fixtures/parametrize.py | 27 ++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 878840fcee..919a9278a9 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -5,10 +5,11 @@ import json import logging import os from collections import defaultdict -from typing import DefaultDict, Dict +from typing import Any, DefaultDict, Dict, Optional import psycopg2 import psycopg2.extras +import toml FLAKY_TESTS_QUERY = """ SELECT @@ -58,6 +59,24 @@ def main(args: argparse.Namespace): else: pageserver_virtual_file_io_engine_parameter = "" + # re-use existing records of flaky tests from before parametrization by compaction_algorithm + def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + """Duplicated from parametrize.py""" + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + pageserver_default_tenant_config_compaction_algorithm_parameter = "" + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + pageserver_default_tenant_config_compaction_algorithm_parameter = ( + f"-{explicit_default['kind']}" + ) + for row in rows: # We don't want to automatically rerun tests in a performance suite if row["parent_suite"] != "test_runner.regress": @@ -66,10 +85,10 @@ def main(args: argparse.Namespace): if row["name"].endswith("]"): parametrized_test = row["name"].replace( "[", - f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-", + f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", ) else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]" + parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" res[row["parent_suite"]][row["suite"]][parametrized_test] = True diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8432655370..62a4b974a3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -467,6 +467,7 @@ class NeonEnvBuilder: initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -507,6 +508,14 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_default_tenant_config_compaction_algorithm: Optional[ + Dict[str, Any] + ] = pageserver_default_tenant_config_compaction_algorithm + if self.pageserver_default_tenant_config_compaction_algorithm is not None: + log.debug( + f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" + ) + self.pageserver_get_vectored_impl: Optional[str] = None if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": self.pageserver_get_vectored_impl = "vectored" @@ -1103,6 +1112,11 @@ class NeonEnv: ps_cfg["get_impl"] = config.pageserver_get_impl if config.pageserver_validate_vectored_get is not None: ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + if config.pageserver_default_tenant_config_compaction_algorithm is not None: + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config[ + "compaction_algorithm" + ] = config.pageserver_default_tenant_config_compaction_algorithm if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1304,6 +1318,7 @@ def _shared_simple_env( pg_version: PgVersion, pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1335,6 +1350,7 @@ def _shared_simple_env( test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: env = builder.init_start() @@ -1375,6 +1391,7 @@ def neon_env_builder( top_output_dir: Path, pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1409,6 +1426,7 @@ def neon_env_builder( test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 77523a542b..0227285822 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -1,7 +1,8 @@ import os -from typing import Optional +from typing import Any, Dict, Optional import pytest +import toml from _pytest.python import Metafunc from fixtures.pg_version import PgVersion @@ -37,6 +38,20 @@ def pageserver_aux_file_policy() -> Optional[AuxFileStore]: return None +def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + return get_pageserver_default_tenant_config_compaction_algorithm() + + def pytest_generate_tests(metafunc: Metafunc): if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] @@ -60,6 +75,16 @@ def pytest_generate_tests(metafunc: Metafunc): ): metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) + # Same hack for pageserver_default_tenant_config_compaction_algorithm + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + metafunc.parametrize( + "pageserver_default_tenant_config_compaction_algorithm", + [explicit_default], + ids=[explicit_default["kind"]], + ) + # For performance tests, parametrize also by platform if ( "test_runner/performance" in metafunc.definition._nodeid From 1a2a3cb446be40b7a1453876f0ea8d128c3435cd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 May 2024 16:23:36 +0300 Subject: [PATCH 054/130] Add restart_lsn metric for logical slots. --- vm-image-spec.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index e9d983eba3..fa7cd014bf 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -293,6 +293,16 @@ files: values: [checkpoints_timed] query: | SELECT checkpoints_timed FROM pg_stat_bgwriter; + + # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. + - metric_name: logical_slot_restart_lsn + type: gauge + help: 'restart_lsn of logical slots' + key_labels: + - slot_name + values: [restart_lsn] + query: | + select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical'; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling From 438bacc32eb5d6888d0aec623006e6046b59299e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 15 May 2024 12:29:12 +0100 Subject: [PATCH 055/130] CI(neon-extra-builds): Use small-arm64 runners instead of large-arm64 (#7740) ## Problem There are not enough arm runners and jobs in `neon-extra-builds` workflow take about the same amount of time on a small-arm runner as on large-arm. ## Summary of changes - Switch `neon-extra-builds` workflow from `large-arm64` to `small-arm64` runners --- .github/actionlint.yml | 1 + .github/workflows/neon_extra_builds.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 942861ecd8..37983798b7 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -5,6 +5,7 @@ self-hosted-runner: - large - large-arm64 - small + - small-arm64 - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index fdb03963fb..7d2187e59c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -136,7 +136,7 @@ jobs: check-linux-arm-build: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, large-arm64 ] + runs-on: [ self-hosted, small-arm64 ] env: # Use release build only, to have less debug info around @@ -260,7 +260,7 @@ jobs: check-codestyle-rust-arm: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, large-arm64 ] + runs-on: [ self-hosted, small-arm64 ] container: image: ${{ needs.build-build-tools-image.outputs.image }} From f342b87f306f5a05cf91d8191833c2df9f6d4acd Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 15 May 2024 13:05:24 +0100 Subject: [PATCH 056/130] pageserver: remove Option<> around remote storage, clean up metadata file refs (#7752) ## Problem This is historical baggage from when the pageserver could be run with local disk only: we had a bunch of places where we had to treat remote storage as optional. Closes: https://github.com/neondatabase/neon/issues/6890 ## Changes - Remove Option<> around remote storage (in https://github.com/neondatabase/neon/pull/7722 we made remote storage clearly mandatory) - Remove code for deleting old metadata files: they're all gone now. - Remove other references to metadata files when loading directories, as none exist. I checked last 14 days of logs for "found legacy metadata", there are no instances. --- pageserver/ctl/src/draw_timeline_dir.rs | 5 - pageserver/src/bin/pageserver.rs | 43 ++-- pageserver/src/deletion_queue.rs | 23 +- pageserver/src/http/routes.rs | 44 +--- pageserver/src/lib.rs | 10 +- pageserver/src/tenant.rs | 177 ++++++--------- pageserver/src/tenant/delete.rs | 55 ++--- pageserver/src/tenant/mgr.rs | 84 ++----- .../src/tenant/remote_timeline_client.rs | 4 +- pageserver/src/tenant/secondary/downloader.rs | 8 +- pageserver/src/tenant/storage_layer/layer.rs | 56 ++--- .../src/tenant/storage_layer/layer/tests.rs | 10 +- pageserver/src/tenant/timeline.rs | 210 +++++++----------- pageserver/src/tenant/timeline/compaction.rs | 12 +- pageserver/src/tenant/timeline/delete.rs | 40 ++-- .../src/tenant/timeline/detach_ancestor.rs | 27 +-- .../src/tenant/timeline/eviction_task.rs | 7 +- pageserver/src/tenant/timeline/init.rs | 7 +- test_runner/regress/test_broken_timeline.py | 20 +- 19 files changed, 285 insertions(+), 557 deletions(-) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 4dff8af1fc..389519c65a 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -52,7 +52,6 @@ use anyhow::{Context, Result}; use pageserver::repository::Key; -use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; @@ -159,10 +158,6 @@ pub fn main() -> Result<()> { let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); - if filename == METADATA_FILE_NAME { - // Don't try and parse "metadata" like a key-lsn range - continue; - } let (key_range, lsn_range) = parse_filename(filename); files.push(Layer { filename: filename.to_owned(), diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 49f8a41b37..c0099aa704 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -383,7 +383,7 @@ fn start_pageserver( let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); // Set up remote storage client - let remote_storage = Some(create_remote_storage_client(conf)?); + let remote_storage = create_remote_storage_client(conf)?; // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( @@ -516,16 +516,12 @@ fn start_pageserver( } }); - let secondary_controller = if let Some(remote_storage) = &remote_storage { - secondary::spawn_tasks( - tenant_manager.clone(), - remote_storage.clone(), - background_jobs_barrier.clone(), - shutdown_pageserver.clone(), - ) - } else { - secondary::null_controller() - }; + let secondary_controller = secondary::spawn_tasks( + tenant_manager.clone(), + remote_storage.clone(), + background_jobs_barrier.clone(), + shutdown_pageserver.clone(), + ); // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint @@ -533,15 +529,13 @@ fn start_pageserver( // been configured. let disk_usage_eviction_state: Arc = Arc::default(); - if let Some(remote_storage) = &remote_storage { - launch_disk_usage_global_eviction_task( - conf, - remote_storage.clone(), - disk_usage_eviction_state.clone(), - tenant_manager.clone(), - background_jobs_barrier.clone(), - )?; - } + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + tenant_manager.clone(), + background_jobs_barrier.clone(), + )?; // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. @@ -693,14 +687,7 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - ) - .await; + pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; unreachable!() }) } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index c937309d83..8790a9b0a8 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -632,7 +632,7 @@ impl DeletionQueue { /// /// If remote_storage is None, then the returned workers will also be None. pub fn new( - remote_storage: Option, + remote_storage: GenericRemoteStorage, control_plane_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) @@ -658,23 +658,6 @@ impl DeletionQueue { // longer to flush after Tenants have all been torn down. let cancel = CancellationToken::new(); - let remote_storage = match remote_storage { - None => { - return ( - Self { - client: DeletionQueueClient { - tx, - executor_tx, - lsn_table: lsn_table.clone(), - }, - cancel, - }, - None, - ) - } - Some(r) => r, - }; - ( Self { client: DeletionQueueClient { @@ -765,7 +748,7 @@ mod test { /// Simulate a pageserver restart by destroying and recreating the deletion queue async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( - Some(self.storage.clone()), + self.storage.clone(), Some(self.mock_control_plane.clone()), self.harness.conf, ); @@ -875,7 +858,7 @@ mod test { let mock_control_plane = MockControlPlane::new(); let (deletion_queue, worker) = DeletionQueue::new( - Some(storage.clone()), + storage.clone(), Some(mock_control_plane.clone()), harness.conf, ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2370561756..0a98d32f02 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -104,7 +104,7 @@ pub struct State { tenant_manager: Arc, auth: Option>, allowlist_routes: Vec, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -118,7 +118,7 @@ impl State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -813,12 +813,6 @@ async fn tenant_attach_handler( let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - if state.remote_storage.is_none() { - return Err(ApiError::BadRequest(anyhow!( - "attach_tenant is not possible because pageserver was configured without remote storage" - ))); - } - let tenant_shard_id = TenantShardId::unsharded(tenant_id); let shard_params = ShardParameters::default(); let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params); @@ -1643,12 +1637,6 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run time travel" - ))); - }; - if timestamp > done_if_after { return Err(ApiError::BadRequest(anyhow!( "The done_if_after timestamp comes before the timestamp to recover to" @@ -1658,7 +1646,7 @@ async fn tenant_time_travel_remote_storage_handler( tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); remote_timeline_client::upload::time_travel_recover_tenant( - storage, + &state.remote_storage, &tenant_shard_id, timestamp, done_if_after, @@ -1903,11 +1891,6 @@ async fn deletion_queue_flush( ) -> Result, ApiError> { let state = get_state(&r); - if state.remote_storage.is_none() { - // Nothing to do if remote storage is disabled. - return json_response(StatusCode::OK, ()); - } - let execute = parse_query_param(&r, "execute")?.unwrap_or(false); let flush = async { @@ -2072,18 +2055,11 @@ async fn disk_usage_eviction_run( }; let state = get_state(&r); - - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run eviction iteration" - ))); - }; - let eviction_state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( &eviction_state, - storage, + &state.remote_storage, usage, &state.tenant_manager, config.eviction_order, @@ -2120,29 +2096,23 @@ async fn tenant_scan_remote_handler( let state = get_state(&request); let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - let Some(remote_storage) = state.remote_storage.as_ref() else { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Remote storage not configured" - ))); - }; - let mut response = TenantScanRemoteStorageResponse::default(); let (shards, _other_keys) = - list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone()) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; for tenant_shard_id in shards { let (timeline_ids, _other_keys) = - list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone()) + list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone()) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; let mut generation = Generation::none(); for timeline_id in timeline_ids { match download_index_part( - remote_storage, + &state.remote_storage, &tenant_shard_id, &timeline_id, Generation::MAX, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 930700e50c..c69fb8c83b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] pub async fn shutdown_pageserver( tenant_manager: &TenantManager, - deletion_queue: Option, + mut deletion_queue: DeletionQueue, exit_code: i32, ) { use std::time::Duration; @@ -89,9 +89,7 @@ pub async fn shutdown_pageserver( .await; // Best effort to persist any outstanding deletions, to avoid leaking objects - if let Some(mut deletion_queue) = deletion_queue { - deletion_queue.shutdown(Duration::from_secs(5)).await; - } + deletion_queue.shutdown(Duration::from_secs(5)).await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. @@ -114,10 +112,6 @@ pub async fn shutdown_pageserver( std::process::exit(exit_code); } -/// The name of the metadata file pageserver creates per timeline. -/// Full path: `tenants//timelines//metadata`. -pub const METADATA_FILE_NAME: &str = "metadata"; - /// Per-tenant configuration file. /// Full path: `tenants//config`. pub(crate) const TENANT_CONFIG_NAME: &str = "config"; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 80d354d79e..026cbc107c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -190,7 +190,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; #[derive(Clone)] pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, - pub remote_storage: Option, + pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, } @@ -292,7 +292,7 @@ pub struct Tenant { walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage - pub(crate) remote_storage: Option, + pub(crate) remote_storage: GenericRemoteStorage, // Access to global deletion queue for when this tenant wants to schedule a deletion deletion_queue_client: DeletionQueueClient, @@ -551,21 +551,22 @@ impl Tenant { ); if let Some(index_part) = index_part.as_ref() { - timeline - .remote_client - .as_ref() - .unwrap() - .init_upload_queue(index_part)?; - } else if self.remote_storage.is_some() { + timeline.remote_client.init_upload_queue(index_part)?; + } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. - let rtc = timeline.remote_client.as_ref().unwrap(); - rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline + .remote_client + .init_upload_queue_for_empty_remote(&metadata)?; + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -777,14 +778,14 @@ impl Tenant { AttachType::Normal }; - let preload = match (&mode, &remote_storage) { - (SpawnMode::Create, _) => { + let preload = match &mode { + SpawnMode::Create => { None }, - (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => { + SpawnMode::Eager | SpawnMode::Lazy => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone - .preload(remote_storage, task_mgr::shutdown_token()) + .preload(&remote_storage, task_mgr::shutdown_token()) .await; match res { Ok(p) => Some(p), @@ -794,10 +795,7 @@ impl Tenant { } } } - (_, None) => { - let _preload_timer = TENANT.preload.start_timer(); - None - } + }; // Remote preload is complete. @@ -1021,7 +1019,7 @@ impl Tenant { index_part, remote_metadata, TimelineResources { - remote_client: Some(remote_client), + remote_client, deletion_queue_client: self.deletion_queue_client.clone(), timeline_get_throttle: self.timeline_get_throttle.clone(), }, @@ -1047,7 +1045,7 @@ impl Tenant { Arc::clone(self), timeline_id, &index_part.metadata, - Some(remote_timeline_client), + remote_timeline_client, self.deletion_queue_client.clone(), ) .instrument(tracing::info_span!("timeline_delete", %timeline_id)) @@ -1139,9 +1137,7 @@ impl Tenant { let mut size = 0; for timeline in self.list_timelines() { - if let Some(remote_client) = &timeline.remote_client { - size += remote_client.get_remote_physical_size(); - } + size += timeline.remote_client.get_remote_physical_size(); } size @@ -1191,6 +1187,7 @@ impl Tenant { pub fn create_broken_tenant( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, + remote_storage: GenericRemoteStorage, reason: String, ) -> Arc { Arc::new(Tenant::new( @@ -1205,7 +1202,7 @@ impl Tenant { ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), None, tenant_shard_id, - None, + remote_storage, DeletionQueueClient::broken(), )) } @@ -1398,13 +1395,7 @@ impl Tenant { tline.freeze_and_flush().await.context("freeze_and_flush")?; // Make sure the freeze_and_flush reaches remote storage. - tline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + tline.remote_client.wait_completion().await.unwrap(); let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. @@ -1470,20 +1461,19 @@ impl Tenant { return Err(CreateTimelineError::Conflict); } - if let Some(remote_client) = existing.remote_client.as_ref() { - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; - } + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + existing + .remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; return Ok(existing); } @@ -1559,14 +1549,14 @@ impl Tenant { // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must // not send a success to the caller until it is. The same applies to handling retries, // see the handling of [`TimelineExclusionError::AlreadyExists`] above. - if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { - let kind = ancestor_timeline_id - .map(|_| "branched") - .unwrap_or("bootstrapped"); - remote_client.wait_completion().await.with_context(|| { - format!("wait for {} timeline initial uploads to complete", kind) - })?; - } + let kind = ancestor_timeline_id + .map(|_| "branched") + .unwrap_or("bootstrapped"); + loaded_timeline + .remote_client + .wait_completion() + .await + .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?; loaded_timeline.activate(self.clone(), broker_client, None, ctx); @@ -2161,32 +2151,26 @@ impl Tenant { ) -> anyhow::Result<()> { let timelines = self.timelines.lock().unwrap().clone(); for timeline in timelines.values() { - let Some(tl_client) = &timeline.remote_client else { - anyhow::bail!("Remote storage is mandatory"); - }; - - let Some(remote_storage) = &self.remote_storage else { - anyhow::bail!("Remote storage is mandatory"); - }; - // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels // to ensure that they do not start a split if currently in the process of doing these. // Upload an index from the parent: this is partly to provide freshness for the // child tenants that will copy it, and partly for general ease-of-debugging: there will // always be a parent shard index in the same generation as we wrote the child shard index. - tl_client.schedule_index_upload_for_file_changes()?; - tl_client.wait_completion().await?; + timeline + .remote_client + .schedule_index_upload_for_file_changes()?; + timeline.remote_client.wait_completion().await?; // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. - tl_client.shutdown().await; + timeline.remote_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this // operation is rare, so it's simpler to just download it (and robustly guarantees that the index // we use here really is the remotely persistent one). - let result = tl_client + let result = timeline.remote_client .download_index_file(&self.cancel) .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) .await?; @@ -2199,7 +2183,7 @@ impl Tenant { for child_shard in child_shards { upload_index_part( - remote_storage, + &self.remote_storage, child_shard, &timeline.timeline_id, self.generation, @@ -2475,7 +2459,7 @@ impl Tenant { shard_identity: ShardIdentity, walredo_mgr: Option>, tenant_shard_id: TenantShardId, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -3119,11 +3103,10 @@ impl Tenant { // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC // could get incorrect information and remove more layers, than needed. // See also https://github.com/neondatabase/neon/issues/3865 - if let Some(remote_client) = new_timeline.remote_client.as_ref() { - remote_client - .schedule_index_upload_for_full_metadata_update(&metadata) - .context("branch initial metadata upload")?; - } + new_timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata) + .context("branch initial metadata upload")?; Ok(new_timeline) } @@ -3155,11 +3138,6 @@ impl Tenant { pgdata_path: &Utf8PathBuf, timeline_id: &TimelineId, ) -> anyhow::Result<()> { - let Some(storage) = &self.remote_storage else { - // No remote storage? No upload. - return Ok(()); - }; - let temp_path = timelines_path.join(format!( "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}" )); @@ -3183,7 +3161,7 @@ impl Tenant { backoff::retry( || async { self::remote_timeline_client::upload_initdb_dir( - storage, + &self.remote_storage, &self.tenant_shard_id.tenant_id, timeline_id, pgdata_zstd.try_clone().await?, @@ -3240,9 +3218,6 @@ impl Tenant { } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { - let Some(storage) = &self.remote_storage else { - bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}"); - }; if existing_initdb_timeline_id != timeline_id { let source_path = &remote_initdb_archive_path( &self.tenant_shard_id.tenant_id, @@ -3252,7 +3227,7 @@ impl Tenant { &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id); // if this fails, it will get retried by retried control plane requests - storage + self.remote_storage .copy_object(source_path, dest_path, &self.cancel) .await .context("copy initdb tar")?; @@ -3260,7 +3235,7 @@ impl Tenant { let (initdb_tar_zst_path, initdb_tar_zst) = self::remote_timeline_client::download_initdb_tar_zst( self.conf, - storage, + &self.remote_storage, &self.tenant_shard_id, &existing_initdb_timeline_id, &self.cancel, @@ -3355,20 +3330,14 @@ impl Tenant { /// Call this before constructing a timeline, to build its required structures fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { - let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { - let remote_client = RemoteTimelineClient::new( - remote_storage.clone(), - self.deletion_queue_client.clone(), - self.conf, - self.tenant_shard_id, - timeline_id, - self.generation, - ); - Some(remote_client) - } else { - None - }; - + let remote_client = RemoteTimelineClient::new( + self.remote_storage.clone(), + self.deletion_queue_client.clone(), + self.conf, + self.tenant_shard_id, + timeline_id, + self.generation, + ); TimelineResources { remote_client, deletion_queue_client: self.deletion_queue_client.clone(), @@ -3392,9 +3361,9 @@ impl Tenant { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); - if let Some(remote_client) = &resources.remote_client { - remote_client.init_upload_queue_for_empty_remote(new_metadata)?; - } + resources + .remote_client + .init_upload_queue_for_empty_remote(new_metadata)?; let timeline_struct = self .create_timeline_struct( @@ -3562,9 +3531,7 @@ impl Tenant { tracing::info!(timeline_id=%timeline.timeline_id, "Flushing..."); timeline.freeze_and_flush().await?; tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads..."); - if let Some(client) = &timeline.remote_client { - client.wait_completion().await?; - } + timeline.remote_client.wait_completion().await?; Ok(()) } @@ -3878,7 +3845,7 @@ pub(crate) mod harness { ShardIdentity::unsharded(), Some(walredo_mgr), self.tenant_shard_id, - Some(self.remote_storage.clone()), + self.remote_storage.clone(), self.deletion_queue.new_client(), )); diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 2e5259bfe2..3173a33dad 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -181,25 +181,23 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { - if let Some(remote_storage) = remote_storage { - let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - backoff::retry( - || async { remote_storage.delete(&path, cancel).await }, - TimeoutOrCancel::caused_by_cancel, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_tenant_remote_delete_mark", - cancel, - ) - .await - .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) - .and_then(|x| x) - .context("remove_tenant_remote_delete_mark")?; - } + let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; + backoff::retry( + || async { remote_storage.delete(&path, cancel).await }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remove_tenant_remote_delete_mark")?; Ok(()) } @@ -297,7 +295,7 @@ impl DeleteTenantFlow { #[instrument(skip_all)] pub(crate) async fn run( conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, cancel: &CancellationToken, @@ -308,9 +306,7 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = - Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await - { + if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -327,7 +323,7 @@ impl DeleteTenantFlow { async fn run_inner( guard: &mut OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: &GenericRemoteStorage, tenant: &Tenant, cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { @@ -339,14 +335,9 @@ impl DeleteTenantFlow { ))? }); - // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend. - // Though sounds scary, different mark name? - // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. - if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) - .await - .context("remote_mark")? - } + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) + .await + .context("remote_mark")?; fail::fail_point!("tenant-delete-before-create-local-mark", |_| { Err(anyhow::anyhow!( @@ -483,7 +474,7 @@ impl DeleteTenantFlow { fn schedule_background( guard: OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, ) { @@ -512,7 +503,7 @@ impl DeleteTenantFlow { async fn background( mut guard: OwnedMutexGuard, conf: &PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: &Arc, ) -> Result<(), DeleteTenantError> { @@ -551,7 +542,7 @@ impl DeleteTenantFlow { remove_tenant_remote_delete_mark( conf, - remote_storage.as_ref(), + &remote_storage, &tenant.tenant_shard_id, &task_mgr::shutdown_token(), ) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7a3e36bf02..5abda7b64e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -47,7 +47,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; @@ -391,22 +391,17 @@ async fn init_load_generations( // deletion list entries may still be valid. We provide that by pushing a recovery operation into // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions // are processed, even though we don't block on recovery completing here. - // - // Must only do this if remote storage is enabled, otherwise deletion queue - // is not running and channel push will fail. - if resources.remote_storage.is_some() { - let attached_tenants = generations - .iter() - .flat_map(|(id, start_mode)| { - match start_mode { - TenantStartupMode::Attached((_mode, generation)) => Some(generation), - TenantStartupMode::Secondary => None, - } - .map(|gen| (*id, *gen)) - }) - .collect(); - resources.deletion_queue_client.recover(attached_tenants)?; - } + let attached_tenants = generations + .iter() + .flat_map(|(id, start_mode)| { + match start_mode { + TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Secondary => None, + } + .map(|gen| (*id, *gen)) + }) + .collect(); + resources.deletion_queue_client.recover(attached_tenants)?; Ok(Some(generations)) } @@ -460,53 +455,6 @@ fn load_tenant_config( } }; - // Clean up legacy `metadata` files. - // Doing it here because every single tenant directory is visited here. - // In any later code, there's different treatment of tenant dirs - // ... depending on whether the tenant is in re-attach response or not - // ... epending on whether the tenant is ignored or not - assert_eq!( - &conf.tenant_path(&tenant_shard_id), - &tenant_dir_path, - "later use of conf....path() methods would be dubious" - ); - let timelines: Vec = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() { - Ok(iter) => { - let mut timelines = Vec::new(); - for res in iter { - let p = res?; - let Some(timeline_id) = p.file_name().parse::().ok() else { - // skip any entries that aren't TimelineId, such as - // - *.___temp dirs - // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart) - continue; - }; - timelines.push(timeline_id); - } - timelines - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![], - Err(e) => return Err(anyhow::anyhow!(e)), - }; - for timeline_id in timelines { - let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id); - let metadata_path = timeline_path.join(METADATA_FILE_NAME); - match std::fs::remove_file(&metadata_path) { - Ok(()) => { - crashsafe::fsync(timeline_path) - .context("fsync timeline dir after removing legacy metadata file")?; - info!("removed legacy metadata file at {metadata_path}"); - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - // something removed the file earlier, or it was never there - // We don't care, this software version doesn't write it again, so, we're good. - } - Err(e) => { - anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}"); - } - } - } - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); if tenant_ignore_mark_file.exists() { info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); @@ -611,6 +559,7 @@ pub async fn init_tenant_mgr( TenantSlot::Attached(Tenant::create_broken_tenant( conf, tenant_shard_id, + resources.remote_storage.clone(), format!("{}", e), )), ); @@ -803,6 +752,7 @@ fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); + let remote_storage = resources.remote_storage.clone(); let tenant = match Tenant::spawn( conf, tenant_shard_id, @@ -817,7 +767,7 @@ fn tenant_spawn( Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}")) + Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}")) } }; @@ -2276,7 +2226,7 @@ pub(crate) async fn load_tenant( tenant_id: TenantId, generation: Generation, broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ -2937,7 +2887,7 @@ pub(crate) async fn immediate_gc( } let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); + let rtc = timeline.as_ref().map(|x| &x.remote_client); if let Some(rtc) = rtc { // layer drops schedule actions on remote timeline client to actually do the diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 630ade5c13..c5462dac43 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2137,7 +2137,7 @@ mod tests { tenant_ctx: _tenant_ctx, } = test_setup; - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; // Download back the index.json, and check that the list of files is correct let initial_index_part = match client @@ -2328,7 +2328,7 @@ mod tests { timeline, .. } = TestSetup::new("metrics").await.unwrap(); - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let local_path = local_layer_path( diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c28e041fa2..46a3d7e81f 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -26,7 +26,7 @@ use crate::{ tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - METADATA_FILE_NAME, TEMP_FILE_SUFFIX, + TEMP_FILE_SUFFIX, }; use super::{ @@ -1074,11 +1074,7 @@ async fn init_timeline_state( .fatal_err(&format!("Read metadata on {}", file_path)); let file_name = file_path.file_name().expect("created it from the dentry"); - if file_name == METADATA_FILE_NAME { - // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. - warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); - continue; - } else if crate::is_temporary(&file_path) + if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) || is_ephemeral_file(file_name) { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b6f7702247..e8c712c4c6 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -585,9 +585,6 @@ struct LayerInner { /// [`Timeline::gate`] at the same time. timeline: Weak, - /// Cached knowledge of [`Timeline::remote_client`] being `Some`. - have_remote_client: bool, - access_stats: LayerAccessStats, /// This custom OnceCell is backed by std mutex, but only held for short time periods. @@ -732,23 +729,23 @@ impl Drop for LayerInner { if removed { timeline.metrics.resident_physical_size_sub(file_size); } - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); + let res = timeline + .remote_client + .schedule_deletion_of_unlinked(vec![(file_name, meta)]); - if let Err(e) = res { - // test_timeline_deletion_with_files_stuck_in_upload_queue is good at - // demonstrating this deadlock (without spawn_blocking): stop will drop - // queued items, which will have ResidentLayer's, and those drops would try - // to re-entrantly lock the RemoteTimelineClient inner state. - if !timeline.is_active() { - tracing::info!("scheduling deletion on drop failed: {e:#}"); - } else { - tracing::warn!("scheduling deletion on drop failed: {e:#}"); - } - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + if let Err(e) = res { + // test_timeline_deletion_with_files_stuck_in_upload_queue is good at + // demonstrating this deadlock (without spawn_blocking): stop will drop + // queued items, which will have ResidentLayer's, and those drops would try + // to re-entrantly lock the RemoteTimelineClient inner state. + if !timeline.is_active() { + tracing::info!("scheduling deletion on drop failed: {e:#}"); } else { - LAYER_IMPL_METRICS.inc_completed_deletes(); + tracing::warn!("scheduling deletion on drop failed: {e:#}"); } + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); + } else { + LAYER_IMPL_METRICS.inc_completed_deletes(); } }); } @@ -786,7 +783,6 @@ impl LayerInner { path: local_path, desc, timeline: Arc::downgrade(timeline), - have_remote_client: timeline.remote_client.is_some(), access_stats, wanted_deleted: AtomicBool::new(false), inner, @@ -815,8 +811,6 @@ impl LayerInner { /// in a new attempt to evict OR join the previously started attempt. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))] pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { - assert!(self.have_remote_client); - let mut rx = self.status.as_ref().unwrap().subscribe(); { @@ -973,10 +967,6 @@ impl LayerInner { return Err(DownloadError::NotFile(ft)); } - if timeline.remote_client.as_ref().is_none() { - return Err(DownloadError::NoRemoteStorage); - } - if let Some(ctx) = ctx { self.check_expected_download(ctx)?; } @@ -1113,12 +1103,8 @@ impl LayerInner { permit: heavier_once_cell::InitPermit, ctx: &RequestContext, ) -> anyhow::Result> { - let client = timeline + let result = timeline .remote_client - .as_ref() - .expect("checked before download_init_and_wait"); - - let result = client .download_layer_file( &self.desc.layer_name(), &self.metadata(), @@ -1293,20 +1279,10 @@ impl LayerInner { /// `DownloadedLayer` is being dropped, so it calls this method. fn on_downloaded_layer_drop(self: Arc, only_version: usize) { - let can_evict = self.have_remote_client; - // we cannot know without inspecting LayerInner::inner if we should evict or not, even // though here it is very likely let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); - if !can_evict { - // it would be nice to assert this case out, but we are in drop - span.in_scope(|| { - tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage"); - }); - return; - } - // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might // drop while the `self.inner` is being locked, leading to a deadlock. @@ -1578,8 +1554,6 @@ pub(crate) enum EvictionError { pub(crate) enum DownloadError { #[error("timeline has already shutdown")] TimelineShutdown, - #[error("no remote storage configured")] - NoRemoteStorage, #[error("context denies downloading")] ContextAndConfigReallyDeniesDownloads, #[error("downloading is really required but not allowed by this method")] diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 52f62faa8d..fa9142d5e9 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -145,7 +145,7 @@ async fn smoke_test() { .await .expect("the local layer file still exists"); - let rtc = timeline.remote_client.as_ref().unwrap(); + let rtc = &timeline.remote_client; { let layers = &[layer]; @@ -761,13 +761,7 @@ async fn eviction_cancellation_on_drop() { timeline.freeze_and_flush().await.unwrap(); // wait for the upload to complete so our Arc::strong_count assertion holds - timeline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + timeline.remote_client.wait_completion().await.unwrap(); let (evicted_layer, not_evicted) = { let mut layers = { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ca34b4fadc..df9bc9b35b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -200,7 +200,7 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { - pub remote_client: Option, + pub remote_client: RemoteTimelineClient, pub deletion_queue_client: DeletionQueueClient, pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, @@ -272,7 +272,7 @@ pub struct Timeline { /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. - pub remote_client: Option>, + pub remote_client: Arc, // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all @@ -1375,22 +1375,14 @@ impl Timeline { /// not validated with control plane yet. /// See [`Self::get_remote_consistent_lsn_visible`]. pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_projected() - } else { - None - } + self.remote_client.remote_consistent_lsn_projected() } /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_visible() - } else { - None - } + self.remote_client.remote_consistent_lsn_visible() } /// The sum of the file size of all historic layers in the layer map. @@ -1760,16 +1752,14 @@ impl Timeline { match self.freeze_and_flush().await { Ok(_) => { // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.shutdown().await; - } + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + self.remote_client.shutdown().await; } Err(e) => { // Non-fatal. Shutdown is infallible. Failures to flush just mean that @@ -1785,18 +1775,16 @@ impl Timeline { // Transition the remote_client into a state where it's only useful for timeline deletion. // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.stop(); - // As documented in remote_client.stop()'s doc comment, it's our responsibility - // to shut down the upload queue tasks. - // TODO: fix that, task management should be encapsulated inside remote_client. - task_mgr::shutdown_tasks( - Some(TaskKind::RemoteUploadTask), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; - } + self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. + task_mgr::shutdown_tasks( + Some(TaskKind::RemoteUploadTask), + Some(self.tenant_shard_id), + Some(self.timeline_id), + ) + .await; // TODO: work toward making this a no-op. See this funciton's doc comment for more context. tracing::debug!("Waiting for tasks..."); @@ -1922,10 +1910,6 @@ impl Timeline { return Ok(None); }; - if self.remote_client.is_none() { - return Ok(Some(false)); - } - layer.download().await?; Ok(Some(true)) @@ -2190,7 +2174,7 @@ impl Timeline { walredo_mgr, walreceiver: Mutex::new(None), - remote_client: resources.remote_client.map(Arc::new), + remote_client: Arc::new(resources.remote_client), // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { @@ -2437,10 +2421,6 @@ impl Timeline { discovered_layers.push((layer_file_name, local_path, file_size)); continue; } - Discovered::Metadata => { - warn!("found legacy metadata file, these should have been removed in load_tenant_config"); - continue; - } Discovered::IgnoredBackup => { continue; } @@ -2487,12 +2467,10 @@ impl Timeline { if local.metadata.file_size() == remote.file_size() { // Use the local file, but take the remote metadata so that we pick up // the correct generation. - UseLocal( - LocalLayerFileMetadata { - metadata: remote, - local_path: local.local_path - } - ) + UseLocal(LocalLayerFileMetadata { + metadata: remote, + local_path: local.local_path, + }) } else { init::cleanup_local_file_for_remote(&local, &remote)?; UseRemote { local, remote } @@ -2501,7 +2479,11 @@ impl Timeline { Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { if let Some(local) = local { - init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?; + init::cleanup_future_layer( + &local.local_path, + &name, + disk_consistent_lsn, + )?; } needs_cleanup.push(name); continue; @@ -2523,7 +2505,8 @@ impl Timeline { let layer = match decision { UseLocal(local) => { total_physical_size += local.metadata.file_size(); - Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard() + Layer::for_resident(conf, &this, local.local_path, name, local.metadata) + .drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. } => { Layer::for_evicted(conf, &this, name, remote) @@ -2543,36 +2526,36 @@ impl Timeline { guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); - if let Some(rtc) = self.remote_client.as_ref() { - rtc.schedule_layer_file_deletion(&needs_cleanup)?; - rtc.schedule_index_upload_for_file_changes()?; - // This barrier orders above DELETEs before any later operations. - // This is critical because code executing after the barrier might - // create again objects with the same key that we just scheduled for deletion. - // For example, if we just scheduled deletion of an image layer "from the future", - // later compaction might run again and re-create the same image layer. - // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. - // "same" here means same key range and LSN. - // - // Without a barrier between above DELETEs and the re-creation's PUTs, - // the upload queue may execute the PUT first, then the DELETE. - // In our example, we will end up with an IndexPart referencing a non-existent object. - // - // 1. a future image layer is created and uploaded - // 2. ps restart - // 3. the future layer from (1) is deleted during load layer map - // 4. image layer is re-created and uploaded - // 5. deletion queue would like to delete (1) but actually deletes (4) - // 6. delete by name works as expected, but it now deletes the wrong (later) version - // - // See https://github.com/neondatabase/neon/issues/5878 - // - // NB: generation numbers naturally protect against this because they disambiguate - // (1) and (4) - rtc.schedule_barrier()?; - // Tenant::create_timeline will wait for these uploads to happen before returning, or - // on retry. - } + self.remote_client + .schedule_layer_file_deletion(&needs_cleanup)?; + self.remote_client + .schedule_index_upload_for_file_changes()?; + // This barrier orders above DELETEs before any later operations. + // This is critical because code executing after the barrier might + // create again objects with the same key that we just scheduled for deletion. + // For example, if we just scheduled deletion of an image layer "from the future", + // later compaction might run again and re-create the same image layer. + // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. + // "same" here means same key range and LSN. + // + // Without a barrier between above DELETEs and the re-creation's PUTs, + // the upload queue may execute the PUT first, then the DELETE. + // In our example, we will end up with an IndexPart referencing a non-existent object. + // + // 1. a future image layer is created and uploaded + // 2. ps restart + // 3. the future layer from (1) is deleted during load layer map + // 4. image layer is re-created and uploaded + // 5. deletion queue would like to delete (1) but actually deletes (4) + // 6. delete by name works as expected, but it now deletes the wrong (later) version + // + // See https://github.com/neondatabase/neon/issues/5878 + // + // NB: generation numbers naturally protect against this because they disambiguate + // (1) and (4) + self.remote_client.schedule_barrier()?; + // Tenant::create_timeline will wait for these uploads to happen before returning, or + // on retry. info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -3025,9 +3008,6 @@ impl Timeline { /// should treat this as a cue to simply skip doing any heatmap uploading /// for this timeline. pub(crate) async fn generate_heatmap(&self) -> Option { - // no point in heatmaps without remote client - let _remote_client = self.remote_client.as_ref()?; - if !self.is_active() { return None; } @@ -3055,10 +3035,7 @@ impl Timeline { // branchpoint in the value in IndexPart::lineage self.ancestor_lsn == lsn || (self.ancestor_lsn == Lsn::INVALID - && self - .remote_client - .as_ref() - .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn))) + && self.remote_client.is_previous_ancestor_lsn(lsn)) } } @@ -3978,29 +3955,23 @@ impl Timeline { x.unwrap() )); - if let Some(remote_client) = &self.remote_client { - for layer in layers_to_upload { - remote_client.schedule_layer_file_upload(layer)?; - } - remote_client.schedule_index_upload_for_metadata_update(&update)?; + for layer in layers_to_upload { + self.remote_client.schedule_layer_file_upload(layer)?; } + self.remote_client + .schedule_index_upload_for_metadata_update(&update)?; Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { - if let Some(remote_client) = &self.remote_client { - remote_client - .preserve_initdb_archive( - &self.tenant_shard_id.tenant_id, - &self.timeline_id, - &self.cancel, - ) - .await?; - } else { - bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id); - } - Ok(()) + self.remote_client + .preserve_initdb_archive( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + &self.cancel, + ) + .await } // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked @@ -4361,12 +4332,7 @@ impl Timeline { return; } - if self - .remote_client - .as_ref() - .map(|c| c.is_deleting()) - .unwrap_or(false) - { + if self.remote_client.is_deleting() { // The timeline was created in a deletion-resume state, we don't expect logical size to be populated return; } @@ -4534,9 +4500,8 @@ impl Timeline { // deletion will happen later, the layer file manager calls garbage_collect_on_drop guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&remove_layers, new_deltas)?; - } + self.remote_client + .schedule_compaction_update(&remove_layers, new_deltas)?; drop_wlock(guard); @@ -4554,9 +4519,8 @@ impl Timeline { let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?; - } + self.remote_client + .schedule_compaction_update(&drop_layers, &upload_layers)?; Ok(()) } @@ -4566,16 +4530,14 @@ impl Timeline { self: &Arc, new_images: impl IntoIterator, ) -> anyhow::Result<()> { - let Some(remote_client) = &self.remote_client else { - return Ok(()); - }; for layer in new_images { - remote_client.schedule_layer_file_upload(layer)?; + self.remote_client.schedule_layer_file_upload(layer)?; } // should any new image layer been created, not uploading index_part will // result in a mismatch between remote_physical_size and layermap calculated // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; + self.remote_client + .schedule_index_upload_for_file_changes()?; Ok(()) } @@ -4861,9 +4823,7 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_gc_update(&gc_layers)?; - } + self.remote_client.schedule_gc_update(&gc_layers)?; guard.finish_gc_timeline(&gc_layers); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4226bf431e..ed48b4c9cb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -295,13 +295,11 @@ impl Timeline { // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage self.rewrite_layers(replace_layers, drop_layers).await?; - if let Some(remote_client) = self.remote_client.as_ref() { - // We wait for all uploads to complete before finishing this compaction stage. This is not - // necessary for correctness, but it simplifies testing, and avoids proceeding with another - // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O - // load. - remote_client.wait_completion().await?; - } + // We wait for all uploads to complete before finishing this compaction stage. This is not + // necessary for correctness, but it simplifies testing, and avoids proceeding with another + // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O + // load. + self.remote_client.wait_completion().await?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d8701be170..901f5149b3 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -26,19 +26,21 @@ use super::{Timeline, TimelineResources}; /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. - Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } + match timeline + .remote_client + .persist_index_part_with_deleted_flag() + .await + { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); } } Ok(()) @@ -117,11 +119,11 @@ pub(super) async fn delete_local_timeline_directory( /// Removes remote layers and an index file after them. async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - Ok(()) + timeline + .remote_client + .delete_all() + .await + .context("delete_all") } // This function removs remaining traces of a timeline on disk. @@ -260,7 +262,7 @@ impl DeleteTimelineFlow { tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, - remote_client: Option, + remote_client: RemoteTimelineClient, deletion_queue_client: DeletionQueueClient, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 9471ba860f..7f59758c87 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -70,10 +70,6 @@ pub(super) async fn prepare( ) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { use Error::*; - if detached.remote_client.as_ref().is_none() { - unimplemented!("no new code for running without remote storage"); - } - let Some((ancestor, ancestor_lsn)) = detached .ancestor_timeline .as_ref() @@ -315,8 +311,6 @@ async fn upload_rewritten_layer( // FIXME: better shuttingdown error target .remote_client - .as_ref() - .unwrap() .upload_layer_file(&copied, cancel) .await .map_err(UploadRewritten)?; @@ -406,8 +400,6 @@ async fn remote_copy( // FIXME: better shuttingdown error adoptee .remote_client - .as_ref() - .unwrap() .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) @@ -421,11 +413,6 @@ pub(super) async fn complete( prepared: PreparedTimelineDetach, _ctx: &RequestContext, ) -> Result, anyhow::Error> { - let rtc = detached - .remote_client - .as_ref() - .expect("has to have a remote timeline client for timeline ancestor detach"); - let PreparedTimelineDetach { layers } = prepared; let ancestor = detached @@ -442,11 +429,13 @@ pub(super) async fn complete( // // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart // which could give us a completely wrong layer combination. - rtc.schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await?; let mut tasks = tokio::task::JoinSet::new(); @@ -491,8 +480,6 @@ pub(super) async fn complete( async move { let res = timeline .remote_client - .as_ref() - .expect("reparented has to have remote client because detached has one") .schedule_reparenting_and_wait(&new_parent) .await; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3567761b9a..8a8c38d0ce 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -23,7 +23,7 @@ use std::{ use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; +use tracing::{debug, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -211,11 +211,6 @@ impl Timeline { // So, we just need to deal with this. - if self.remote_client.is_none() { - error!("no remote storage configured, cannot evict layers"); - return ControlFlow::Continue(()); - } - let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 66aa765015..feadc79e5e 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -9,7 +9,6 @@ use crate::{ storage_layer::LayerName, Generation, }, - METADATA_FILE_NAME, }; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; @@ -27,8 +26,6 @@ pub(super) enum Discovered { Temporary(String), /// Temporary on-demand download files, should be removed TemporaryDownload(String), - /// "metadata" file we persist locally and include in `index_part.json` - Metadata, /// Backup file from previously future layers IgnoredBackup, /// Unrecognized, warn about these @@ -49,9 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { - if file_name == METADATA_FILE_NAME { - Discovered::Metadata - } else if file_name.ends_with(".old") { + if file_name.ends_with(".old") { // ignore these Discovered::IgnoredBackup } else if remote_timeline_client::is_temp_download_file(direntry.path()) { diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 7d4e101189..61afd820ca 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -56,14 +56,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): (tenant0, timeline0, pg0) = tenant_timelines[0] log.info(f"Timeline {tenant0}/{timeline0} is left intact") - (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata" - with open(metadata_path, "w") as f: - f.write("overwritten with garbage!") - log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - - (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/" + (tenant1, timeline1, pg1) = tenant_timelines[2] + timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Corrupt it @@ -72,7 +66,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): with open(p, "wb") as f: f.truncate(0) f.truncate(size) - log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled") + log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled") env.pageserver.start() @@ -80,19 +74,15 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): pg0.start() assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Tenant with corrupt local metadata works: remote storage is authoritative for metadata - pg1.start() - assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Second timeline will fail during basebackup, because the local layer file is corrupt. # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: - pg2.start() + pg1.start() log.info( - f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" + f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" ) From bc78b0e9cc95ea033797b13d5bb36e61d338a070 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 15 May 2024 14:18:02 +0200 Subject: [PATCH 057/130] chore(deps): use upstream svg_fmt after they merged our PR (#7764) They have merged our PR https://github.com/nical/rust_debug/pull/4 but they haven't released a new crate version yet. refs https://github.com/neondatabase/neon/issues/7763 --- Cargo.lock | 2 +- Cargo.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6ce7180d67..961101b151 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5952,7 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" +source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" [[package]] name = "syn" diff --git a/Cargo.toml b/Cargo.toml index 17f30a1327..3ccdabee18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -158,8 +158,8 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -# https://github.com/nical/rust_debug/pull/4 -svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } +# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet +svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" From c3dd646ab3a48e1063c7efc6d080e21fdfb48fa7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 15 May 2024 15:04:52 +0200 Subject: [PATCH 058/130] chore!: always use async walredo, warn if sync is configured (#7754) refs https://github.com/neondatabase/neon/issues/7753 This PR is step (1) of removing sync walredo from Pageserver. Changes: * Remove the sync impl * If sync is configured, warn! and use async instead * Remove the metric that exposes `kind` * Remove the tenant status API that exposes `kind` Future Work ----------- After we've released this change to prod and are sure we won't roll back, we will 1. update the prod Ansible to remove the config flag from the prod pageserver.toml. 2. remove the remaining `kind` code in pageserver These two changes need no release inbetween. See https://github.com/neondatabase/neon/issues/7753 for details. --- libs/pageserver_api/src/models.rs | 3 - pageserver/benches/bench_walredo.rs | 139 ++---- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/config.rs | 2 +- pageserver/src/metrics.rs | 23 - pageserver/src/walredo.rs | 5 +- pageserver/src/walredo/process.rs | 57 +-- .../process/process_impl/process_std.rs | 405 ------------------ test_runner/regress/test_pageserver_config.py | 35 -- 9 files changed, 67 insertions(+), 603 deletions(-) delete mode 100644 pageserver/src/walredo/process/process_impl/process_std.rs delete mode 100644 test_runner/regress/test_pageserver_config.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1df5820fb9..d78d2bcbea 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -776,9 +776,6 @@ pub struct TimelineGcRequest { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerProcessStatus { pub pid: u32, - /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. - /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. - pub kind: Cow<'static, str>, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 5b871c5d5e..5aab10e5d9 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -30,47 +30,27 @@ //! 2024-04-15 on i3en.3xlarge //! //! ```text -//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] -//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] -//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] -//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] -//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] -//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] -//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] -//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] -//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] -//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] -//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] -//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] -//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] -//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] -//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] -//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] -//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] -//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] -//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] -//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] -//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] -//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] -//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] -//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] -//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] -//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] -//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] -//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] -//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] -//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] -//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] -//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] +//! short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! ``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{ - config::PageServerConf, - walrecord::NeonWalRecord, - walredo::{PostgresRedoManager, ProcessKind}, -}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group(format!("{process_kind}-short")); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| { - bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) - }); - }, - ); - } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); } - - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group(format!("{process_kind}-medium")); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| { - bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) - }); - }, - ); - } + } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); } } } @@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. -fn bench_impl( - process_kind: ProcessKind, - redo_work: Arc, - n_redos: u64, - nclients: u64, -) -> Duration { +fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); - conf.walredo_process_kind = process_kind; + let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -158,27 +125,13 @@ fn bench_impl( }); } - let elapsed = rt.block_on(async move { + rt.block_on(async move { let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }); - - // consistency check to ensure process kind setting worked - if nredos_per_client > 0 { - assert_eq!( - manager - .status() - .process - .map(|p| p.kind) - .expect("the benchmark work causes a walredo process to be spawned"), - std::borrow::Cow::Borrowed(process_kind.into()) - ); - } - - elapsed + }) } async fn client( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c0099aa704..a04195e12b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -284,7 +284,6 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); - pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 258eed0b12..b0afb6414b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -99,7 +99,7 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async"; /// /// Default built-in configuration file. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b27bfb43b0..ffcd08b4b3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1999,29 +1999,6 @@ impl Default for WalRedoProcessCounters { pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); -#[cfg(not(test))] -pub mod wal_redo { - use super::*; - - static PROCESS_KIND: Lazy> = Lazy::new(|| { - std::sync::Mutex::new( - register_uint_gauge_vec!( - "pageserver_wal_redo_process_kind", - "The configured process kind for walredo", - &["kind"], - ) - .unwrap(), - ) - }); - - pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { - // use guard to avoid races around the next two steps - let guard = PROCESS_KIND.lock().unwrap(); - guard.reset(); - guard.with_label_values(&[&format!("{kind}")]).set(1); - } -} - /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9776d4ce88..3decea0c6d 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -153,10 +153,7 @@ impl PostgresRedoManager { process: self .redo_process .get() - .map(|p| WalRedoManagerProcessStatus { - pid: p.id(), - kind: std::borrow::Cow::Borrowed(p.kind().into()), - }), + .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), } } } diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index ad6b4e5fe9..02c9c04bf1 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,7 +1,10 @@ +/// Layer of indirection previously used to support multiple implementations. +/// Subject to removal: use std::time::Duration; use bytes::Bytes; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use tracing::warn; use utils::lsn::Lsn; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; @@ -12,7 +15,6 @@ mod protocol; mod process_impl { pub(super) mod process_async; - pub(super) mod process_std; } #[derive( @@ -34,10 +36,7 @@ pub enum Kind { Async, } -pub(crate) enum Process { - Sync(process_impl::process_std::WalRedoProcess), - Async(process_impl::process_async::WalRedoProcess), -} +pub(crate) struct Process(process_impl::process_async::WalRedoProcess); impl Process { #[inline(always)] @@ -46,18 +45,17 @@ impl Process { tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - Ok(match conf.walredo_process_kind { - Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( - conf, - tenant_shard_id, - pg_version, - )?), - Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( - conf, - tenant_shard_id, - pg_version, - )?), - }) + if conf.walredo_process_kind != Kind::Async { + warn!( + configured = %conf.walredo_process_kind, + "the walredo_process_kind setting has been turned into a no-op, using async implementation" + ); + } + Ok(Self(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?)) } #[inline(always)] @@ -69,29 +67,12 @@ impl Process { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - match self { - Process::Sync(p) => { - p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await - } - Process::Async(p) => { - p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await - } - } + self.0 + .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } pub(crate) fn id(&self) -> u32 { - match self { - Process::Sync(p) => p.id(), - Process::Async(p) => p.id(), - } - } - - pub(crate) fn kind(&self) -> Kind { - match self { - Process::Sync(_) => Kind::Sync, - Process::Async(_) => Kind::Async, - } + self.0.id() } } diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs deleted file mode 100644 index e7a6c263c9..0000000000 --- a/pageserver/src/walredo/process/process_impl/process_std.rs +++ /dev/null @@ -1,405 +0,0 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, - walredo::process::{no_leak_child, protocol}, -}; -use anyhow::Context; -use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; - -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) async fn apply_wal_records( - &self, - rel: RelTag, - blknum: u32, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py deleted file mode 100644 index c04348b488..0000000000 --- a/test_runner/regress/test_pageserver_config.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - last_flush_lsn_upload, -) - - -@pytest.mark.parametrize("kind", ["sync", "async"]) -def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): - neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" - # ensure it starts - env = neon_env_builder.init_start() - # ensure the metric is set - ps_http = env.pageserver.http_client() - metrics = ps_http.get_metrics() - samples = metrics.query_all("pageserver_wal_redo_process_kind") - assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] - # ensure default tenant's config kind matches - # => write some data to force-spawn walredo - ep = env.endpoints.create_start("main") - with ep.connect() as conn: - with conn.cursor() as cur: - cur.execute("create table foo(bar text)") - cur.execute("insert into foo select from generate_series(1, 100)") - last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) - ep.stop() - ep.start() - with ep.connect() as conn: - with conn.cursor() as cur: - cur.execute("select count(*) from foo") - [(count,)] = cur.fetchall() - assert count == 100 - - status = ps_http.tenant_status(env.initial_tenant) - assert status["walredo"]["process"]["kind"] == kind From 1075386d778f67ad32200c3e7d7279479a7eb84f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 15 May 2024 15:32:47 +0200 Subject: [PATCH 059/130] Add test_uploads_and_deletions test (#7758) Adds a test that is a reproducer for many tiered compaction bugs, both ones that have since been fixed as well as still unfxied ones: * (now fixed) #7296 * #7707 * #7759 * Likely also #7244 but I haven't tried that. The key ordering bug can be reproduced by switching to `merge_delta_keys` instead of `merge_delta_keys_buffered`, so reverting a big part of #7661, although it only sometimes reproduces (30-50% of cases). part of https://github.com/neondatabase/neon/issues/7554 --- test_runner/fixtures/neon_fixtures.py | 78 +++++++++++++++++ test_runner/regress/test_compaction.py | 62 +++++++++++++- .../regress/test_pageserver_generations.py | 83 +------------------ 3 files changed, 141 insertions(+), 82 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 62a4b974a3..405ef19bfc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,6 +59,7 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, + wait_for_upload_queue_empty, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -79,6 +80,7 @@ from fixtures.utils import ( allure_attach_from_dir, assert_no_errors, get_self_dir, + print_gc_result, subprocess_capture, wait_until, ) @@ -4419,3 +4421,79 @@ def parse_project_git_version_output(s: str) -> str: return commit raise ValueError(f"unable to parse --version output: '{s}'") + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, + pageserver: NeonPageserver, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. + """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 200) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. + final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 43a3323462..93a16620a3 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,10 +1,12 @@ +import enum import json import os from typing import Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.pageserver.http import PageserverApiException from fixtures.workload import Workload AGGRESIVE_COMPACTION_TENANT_CONF = { @@ -190,3 +192,61 @@ def test_sharding_compaction( # Assert that everything is still readable workload.validate() + + +class CompactionAlgorithm(str, enum.Enum): + LEGACY = "Legacy" + TIERED = "Tiered" + + +@pytest.mark.parametrize( + "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED] +) +def test_uploads_and_deletions( + neon_env_builder: NeonEnvBuilder, + compaction_algorithm: CompactionAlgorithm, +): + """ + :param compaction_algorithm: the compaction algorithm to use. + """ + + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}), + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + # TODO remove these allowed errors + # https://github.com/neondatabase/neon/issues/7707 + # https://github.com/neondatabase/neon/issues/7759 + allowed_errors = [ + ".*duplicated L1 layer.*", + ".*delta layer created with.*duplicate values.*", + ".*assertion failed: self.lsn_range.start <= lsn.*", + ".*HTTP request handler task panicked: task.*panicked.*", + ] + if compaction_algorithm == CompactionAlgorithm.TIERED: + env.pageserver.allowed_errors.extend(allowed_errors) + + try: + generate_uploads_and_deletions(env, pageserver=env.pageserver) + except PageserverApiException as e: + log.info(f"Obtained PageserverApiException: {e}") + + # The errors occur flakily and no error is ensured to occur, + # however at least one of them occurs. + if compaction_algorithm == CompactionAlgorithm.TIERED: + found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) + if not found_allowed_error: + raise Exception("None of the allowed_errors occured in the log") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4fdc5852f5..9b97254410 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -21,11 +21,9 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - NeonPageserver, PgBin, S3Scrubber, - flush_ep_to_pageserver, - last_flush_lsn_upload, + generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( @@ -33,12 +31,11 @@ from fixtures.pageserver.utils import ( list_prefix, wait_for_last_record_lsn, wait_for_upload, - wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.utils import print_gc_result, wait_until +from fixtures.utils import wait_until from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions @@ -59,82 +56,6 @@ TENANT_CONF = { } -def generate_uploads_and_deletions( - env: NeonEnv, - *, - init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, - pageserver: NeonPageserver, -): - """ - Using the environment's default tenant + timeline, generate a load pattern - that results in some uploads and some deletions to remote storage. - """ - - if tenant_id is None: - tenant_id = env.initial_tenant - assert tenant_id is not None - - if timeline_id is None: - timeline_id = env.initial_timeline - assert timeline_id is not None - - ps_http = pageserver.http_client() - - with env.endpoints.create_start( - "main", tenant_id=tenant_id, pageserver_id=pageserver.id - ) as endpoint: - if init: - endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") - last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id - ) - - def churn(data): - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 200) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) - assert tenant_id is not None - assert timeline_id is not None - # We are waiting for uploads as well as local flush, in order to avoid leaving the system - # in a state where there are "future layers" in remote storage that will generate deletions - # after a restart. - last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id - ) - - # Compaction should generate some GC-elegible layers - for i in range(0, 2): - churn(f"{i if data is None else data}") - - gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) - print_gc_result(gc_result) - assert gc_result["layers_removed"] > 0 - - # Stop endpoint and flush all data to pageserver, then checkpoint it: this - # ensures that the pageserver is in a fully idle state: there will be no more - # background ingest, no more uploads pending, and therefore no non-determinism - # in subsequent actions like pageserver restarts. - final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id) - # Finish uploads - wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) - # Finish all remote writes (including deletions) - wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) - - def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None ): From 3ef6e2121178a8c4e7f498aff9a9e7cb9376fd1c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 15 May 2024 18:17:55 +0200 Subject: [PATCH 060/130] fixup #7747: actually use the fixture for neon_env_builder (#7767) The `= None` makes it not use the fixture. This slipped due to last-minute changes. --- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 405ef19bfc..d3aadbe612 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1392,8 +1392,8 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore] = None, - pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. From affc18f912b67a31b05a05a05a9cff468a74d75f Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Wed, 15 May 2024 18:41:12 +0200 Subject: [PATCH 061/130] Add performance regress `test_ondemand_download_churn.py` (#7242) Add performance regress test for on-demand download throughput. Closes https://github.com/neondatabase/neon/issues/7146 Co-authored-by: Christian Schwarz Co-authored-by: Alexander Bayandin --- libs/pageserver_api/src/models.rs | 10 + .../src/cmd/ondemand_download_churn.rs | 103 ++++++++--- .../pagebench/test_ondemand_download_churn.py | 175 ++++++++++++++++++ 3 files changed, 267 insertions(+), 21 deletions(-) create mode 100644 test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d78d2bcbea..7cf54bf32a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -745,6 +745,16 @@ impl HistoricLayerInfo { }; *field = value; } + pub fn layer_file_size(&self) -> u64 { + match self { + HistoricLayerInfo::Delta { + layer_file_size, .. + } => *layer_file_size, + HistoricLayerInfo::Image { + layer_file_size, .. + } => *layer_file_size, + } + } } #[derive(Debug, Serialize, Deserialize)] diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index 197e782dca..1bb71b9353 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; use pageserver_client::mgmt_api; use rand::seq::SliceRandom; +use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use utils::id::{TenantTimelineId, TimelineId}; +use std::{f64, sync::Arc}; use tokio::{ sync::{mpsc, OwnedSemaphorePermit}, task::JoinSet, @@ -12,10 +14,7 @@ use tokio::{ use std::{ num::NonZeroUsize, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, + sync::atomic::{AtomicU64, Ordering}, time::{Duration, Instant}, }; @@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> { Ok(()) } +#[derive(serde::Serialize)] +struct Output { + downloads_count: u64, + downloads_bytes: u64, + evictions_count: u64, + timeline_restarts: u64, + #[serde(with = "humantime_serde")] + runtime: Duration, +} + #[derive(Debug, Default)] struct LiveStats { - evictions: AtomicU64, - downloads: AtomicU64, + evictions_count: AtomicU64, + downloads_count: AtomicU64, + downloads_bytes: AtomicU64, timeline_restarts: AtomicU64, } impl LiveStats { fn eviction_done(&self) { - self.evictions.fetch_add(1, Ordering::Relaxed); + self.evictions_count.fetch_add(1, Ordering::Relaxed); } - fn download_done(&self) { - self.downloads.fetch_add(1, Ordering::Relaxed); + fn download_done(&self, size: u64) { + self.downloads_count.fetch_add(1, Ordering::Relaxed); + self.downloads_bytes.fetch_add(size, Ordering::Relaxed); } fn timeline_restart_done(&self) { self.timeline_restarts.fetch_add(1, Ordering::Relaxed); @@ -92,28 +103,49 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { ) .await?; + let token = CancellationToken::new(); let mut tasks = JoinSet::new(); - let live_stats = Arc::new(LiveStats::default()); + let periodic_stats = Arc::new(LiveStats::default()); + let total_stats = Arc::new(LiveStats::default()); + + let start = Instant::now(); tasks.spawn({ - let live_stats = Arc::clone(&live_stats); + let periodic_stats = Arc::clone(&periodic_stats); + let total_stats = Arc::clone(&total_stats); + let cloned_token = token.clone(); async move { let mut last_at = Instant::now(); loop { + if cloned_token.is_cancelled() { + return; + } tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; let now = Instant::now(); let delta: Duration = now - last_at; last_at = now; let LiveStats { - evictions, - downloads, + evictions_count, + downloads_count, + downloads_bytes, timeline_restarts, - } = &*live_stats; - let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); - let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + } = &*periodic_stats; + let evictions_count = evictions_count.swap(0, Ordering::Relaxed); + let downloads_count = downloads_count.swap(0, Ordering::Relaxed); + let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed); let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); - info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}"); + + total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed); + total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed); + total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed); + total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed); + + let evictions_per_s = evictions_count as f64 / delta.as_secs_f64(); + let downloads_per_s = downloads_count as f64 / delta.as_secs_f64(); + let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64); + + info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}"); } } }); @@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { args, Arc::clone(&mgmt_api_client), tl, - Arc::clone(&live_stats), + Arc::clone(&periodic_stats), + token.clone(), )); } } + if let Some(runtime) = args.runtime { + tokio::spawn(async move { + tokio::time::sleep(runtime.into()).await; + token.cancel(); + }); + } while let Some(res) = tasks.join_next().await { res.unwrap(); } + let end = Instant::now(); + let duration: Duration = end - start; + + let output = { + let LiveStats { + evictions_count, + downloads_count, + downloads_bytes, + timeline_restarts, + } = &*total_stats; + Output { + downloads_count: downloads_count.load(Ordering::Relaxed), + downloads_bytes: downloads_bytes.load(Ordering::Relaxed), + evictions_count: evictions_count.load(Ordering::Relaxed), + timeline_restarts: timeline_restarts.load(Ordering::Relaxed), + runtime: duration, + } + }; + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + Ok(()) } @@ -140,6 +200,7 @@ async fn timeline_actor( mgmt_api_client: Arc, timeline: TenantTimelineId, live_stats: Arc, + token: CancellationToken, ) { // TODO: support sharding let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); @@ -149,7 +210,7 @@ async fn timeline_actor( layers: Vec>, concurrency: Arc, } - loop { + while !token.is_cancelled() { debug!("restarting timeline"); let layer_map_info = mgmt_api_client .layer_map_info(tenant_shard_id, timeline.timeline_id) @@ -185,7 +246,7 @@ async fn timeline_actor( live_stats.timeline_restart_done(); - loop { + while !token.is_cancelled() { assert!(!timeline.joinset.is_empty()); if let Some(res) = timeline.joinset.try_join_next() { debug!(?res, "a layer actor exited, should not happen"); @@ -255,7 +316,7 @@ async fn layer_actor( .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) .await .unwrap(); - live_stats.download_done(); + live_stats.download_done(layer.layer_file_size()); did_it } }; diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py new file mode 100644 index 0000000000..644c1f559b --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -0,0 +1,175 @@ +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.utils import wait_for_upload_queue_empty +from fixtures.remote_storage import s3_storage +from fixtures.utils import humantime_to_ms + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("io_engine", ["tokio-epoll-uring", "std-fs"]) +@pytest.mark.parametrize("concurrency_per_target", [1, 10, 100]) +@pytest.mark.timeout(1000) +def test_download_churn( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + io_engine: str, + concurrency_per_target: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_ondemand_download_churn.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + # we don't capture `duration`, but instead use the `runtime` output field from pagebench + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + # Setup env + env = setup_env(neon_env_builder, pg_bin) + env.pageserver.allowed_errors.append( + f".*path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + + +def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # We configure tenant conf such that SQL query below produces a lot of layers. + # We don't care what's in the layers really, we just care that layers are created. + bytes_per_layer = 10 * (1024**2) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "pitr_interval": "1000d", # let's not make it get in the way + "gc_period": "0s", # disable periodic gc to avoid noise + "compaction_period": "0s", # disable L0=>L1 compaction + "checkpoint_timeout": "10years", # rely solely on checkpoint_distance + "checkpoint_distance": bytes_per_layer, # 10M instead of 256M to create more smaller layers + "image_creation_threshold": 100000, # don't create image layers ever + } + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + ep.safe_psql("CREATE TABLE data (random_text text)") + bytes_per_row = 512 # make big enough so WAL record size doesn't dominate + desired_layers = 300 + desired_bytes = bytes_per_layer * desired_layers + nrows = desired_bytes / bytes_per_row + ep.safe_psql( + f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i", + options="-c statement_timeout=0", + ) + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here + wait_for_upload_queue_empty(client, tenant_id, timeline_id) + + return env + + +def run_benchmark( + env: NeonEnv, + pg_bin: PgBin, + record, + io_engine: str, + concurrency_per_target: int, + duration_secs: int, +): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "ondemand-download-churn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--runtime", + f"{duration_secs}s", + "--set-io-engine", + f"{io_engine}", + "--concurrency-per-target", + f"{concurrency_per_target}", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + metric = "downloads_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "downloads_bytes" + record( + metric, + metric_value=results[metric], + unit="byte", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "evictions_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "timeline_restarts" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "runtime" + record( + metric, + metric_value=humantime_to_ms(results[metric]) / 1000, + unit="s", + report=MetricReport.TEST_PARAM, + ) From 4b97683338bc21e13686fe4311946b36462729c1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 15 May 2024 13:17:57 -0400 Subject: [PATCH 062/130] feat(pageserver): use fnv hash for aux file encoding (#7742) FNV hash is simple, portable, and stable. This pull request vendors the FNV hash implementation from servo and modified it to use the u128 variant. replaces https://github.com/neondatabase/neon/pull/7644 ref https://github.com/neondatabase/neon/issues/7462 --------- Signed-off-by: Alex Chi Z --- pageserver/src/aux_file.rs | 65 ++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index e6d950487d..38e1875db1 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -5,14 +5,35 @@ use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; -/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. +// BEGIN Copyright (c) 2017 Servo Contributors + +/// Const version of FNV hash. +#[inline] +#[must_use] +pub const fn fnv_hash(bytes: &[u8]) -> u128 { + const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d; + const PRIME: u128 = 0x0000000001000000000000000000013B; + + let mut hash = INITIAL_STATE; + let mut i = 0; + while i < bytes.len() { + hash ^= bytes[i] as u128; + hash = hash.wrapping_mul(PRIME); + i += 1; + } + hash +} + +// END Copyright (c) 2017 Servo Contributors + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash]. fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { - let mut key = [0; METADATA_KEY_SIZE]; - let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + let mut key: [u8; 16] = [0; METADATA_KEY_SIZE]; + let hash = fnv_hash(data).to_be_bytes(); key[0] = AUX_KEY_PREFIX; key[1] = dir_level1; key[2] = dir_level2; - key[3..16].copy_from_slice(&hash[0..13]); + key[3..16].copy_from_slice(&hash[3..16]); Key::from_metadata_key_fixed_size(&key) } @@ -200,15 +221,19 @@ mod tests { fn test_hash_portable() { // AUX file encoding requires the hash to be portable across all platforms. This test case checks // if the algorithm produces the same hash across different environments. + assert_eq!( - 305317690835051308206966631765527126151, - twox_hash::xxh3::hash128("test1".as_bytes()) + 265160408618497461376862998434862070044, + super::fnv_hash("test1".as_bytes()) ); assert_eq!( - 85104974691013376326742244813280798847, - twox_hash::xxh3::hash128("test/test2".as_bytes()) + 295486155126299629456360817749600553988, + super::fnv_hash("test/test2".as_bytes()) + ); + assert_eq!( + 144066263297769815596495629667062367629, + super::fnv_hash("".as_bytes()) ); - assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); } #[test] @@ -216,28 +241,28 @@ mod tests { // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions // of the page server. assert_eq!( - "6200000101E5B20C5F8DD5AA3289D6D9EAFA", - encode_aux_file_key("pg_logical/mappings/test1").to_string() + "62000001017F8B83D94F7081693471ABF91C", + encode_aux_file_key("pg_logical/mappings/test1").to_string(), ); assert_eq!( - "620000010239AAC544893139B26F501B97E6", - encode_aux_file_key("pg_logical/snapshots/test2").to_string() + "62000001027F8E83D94F7081693471ABFCCD", + encode_aux_file_key("pg_logical/snapshots/test2").to_string(), ); assert_eq!( - "620000010300000000000000000000000000", - encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + "62000001032E07BB014262B821756295C58D", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(), ); assert_eq!( - "62000001FF8635AF2134B7266EC5B4189FD6", - encode_aux_file_key("pg_logical/unsupported").to_string() + "62000001FF4F38E1C74754E7D03C1A660178", + encode_aux_file_key("pg_logical/unsupported").to_string(), ); assert_eq!( - "6200000201772D0E5D71DE14DA86142A1619", + "62000002017F8D83D94F7081693471ABFB92", encode_aux_file_key("pg_replslot/test3").to_string() ); assert_eq!( - "620000FFFF1866EBEB53B807B26A2416F317", - encode_aux_file_key("other_file_not_supported").to_string() + "620000FFFF2B6ECC8AEF93F643DC44F15E03", + encode_aux_file_key("other_file_not_supported").to_string(), ); } From c6d5ff944db91f498e46fa24eb0d667abdf94dba Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 15 May 2024 14:29:12 -0400 Subject: [PATCH 063/130] fix(test): ensure fixtures are correctly used for pageserver_aux_file_policy (#7769) Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d3aadbe612..a6fd4792dd 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1393,7 +1393,7 @@ def neon_env_builder( top_output_dir: Path, pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], - pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_aux_file_policy: Optional[AuxFileStore], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. From 03c603970748844cbc188f1e0dc6179fa1a1e83d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 16 May 2024 09:26:34 +0100 Subject: [PATCH 064/130] pageserver: refine tenant_id->shard lookup (#7762) ## Problem This is tech debt from when shard splitting was implemented, to handle more nicely the edge case of a client reconnect at the moment of the split. During shard splits, there were edge cases where we could incorrectly return NotFound to a getpage@lsn request, prompting an unwanted reconnect/backoff from the client. It is already the case that parent shards during splits are marked InProgress before child shards are created, so `resolve_attached_shard` will not match on them, thereby implicitly preferring child shards (good). However, we were not doing any elegant handling of InProgress in general: `get_active_tenant_with_timeout` was previously mostly dead code: it was inspecting the slot found by `resolve_attached_shard` and maybe waiting for InProgress, but that path is never taken because since ef7c9c2ccc1a385f74455f45b54faa5b101065e6 the resolve function only ever returns attached slots. Closes: https://github.com/neondatabase/neon/issues/7044 ## Summary of changes - Change return value of `resolve_attached_shard` to distinguish between true NotFound case, and the case where we skipped slots that were InProgress. - Rework `get_active_tenant_with_timeout` to loop over calling resolve_attached_shard, waiting if it sees an InProgress result. The resulting behavior during a shard split is: - If we look up a shard early in split when parent is InProgress but children aren't created yet, we'll wait for the parent to be shut down. This corresponds to the part of the split where we wait for LSNs to catch up: so a small delay to the request, but a clean enough handling. - If we look up a shard while child shards are already present, we will match on those shards rather than the parent, as intended. --- pageserver/src/bin/pageserver.rs | 25 ++-- pageserver/src/page_service.rs | 125 +++++++++++------ pageserver/src/tenant/mgr.rs | 222 +++++++++++-------------------- 3 files changed, 176 insertions(+), 196 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a04195e12b..ba5b2608bd 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -647,17 +647,20 @@ fn start_pageserver( None, "libpq endpoint listener", true, - async move { - page_service::libpq_listener_main( - conf, - broker_client, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await + { + let tenant_manager = tenant_manager.clone(); + async move { + page_service::libpq_listener_main( + tenant_manager, + broker_client, + pg_auth, + pageserver_listener, + conf.pg_auth_type, + libpq_ctx, + task_mgr::shutdown_token(), + ) + .await + } }, ); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f6b251283c..35aba044b2 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -32,6 +32,7 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; @@ -49,7 +50,6 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; -use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; @@ -59,13 +59,15 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::mgr; -use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; +use crate::tenant::mgr::GetTenantError; +use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; +use crate::tenant::mgr::TenantManager; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; +use crate::tenant::Tenant; use crate::tenant::Timeline; use crate::trace::Tracer; use pageserver_api::key::rel_block_to_key; @@ -135,7 +137,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// Listens for connections, and launches a new handler task for each. /// pub async fn libpq_listener_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, @@ -180,7 +182,7 @@ pub async fn libpq_listener_main( "serving compute connection task", false, page_service_conn_main( - conf, + tenant_manager.clone(), broker_client.clone(), local_auth, socket, @@ -203,7 +205,7 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, @@ -260,7 +262,8 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -291,11 +294,12 @@ struct HandlerTimeline { } struct PageServerHandler { - _conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, + tenant_manager: Arc, + /// The context created for the lifetime of the connection /// services by this PageServerHandler. /// For each query received over the connection, @@ -381,13 +385,13 @@ impl From for QueryError { impl PageServerHandler { pub fn new( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { - _conf: conf, + tenant_manager, broker_client, auth, claims: None, @@ -552,13 +556,9 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - let tenant = mgr::get_active_tenant_with_timeout( - tenant_id, - ShardSelector::First, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) + .await?; // Make request tracer if needed let mut tracer = if tenant.get_trace_read_requests() { @@ -726,13 +726,9 @@ impl PageServerHandler { // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) + .await?; let timeline = tenant .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .await?; @@ -1370,18 +1366,69 @@ impl PageServerHandler { timeline_id: TimelineId, selector: ShardSelector, ) -> Result, GetActiveTimelineError> { - let tenant = get_active_tenant_with_timeout( - tenant_id, - selector, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await - .map_err(GetActiveTimelineError::Tenant)?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) + .await + .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant.get_timeline(timeline_id, true)?; set_tracing_field_shard_id(&timeline); Ok(timeline) } + + /// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some + /// slots for this tenant are `InProgress` then we will wait. + /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. + /// + /// `timeout` is used as a total timeout for the whole wait operation. + async fn get_active_tenant_with_timeout( + &self, + tenant_id: TenantId, + shard_selector: ShardSelector, + timeout: Duration, + ) -> Result, GetActiveTenantError> { + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + + // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is + // for handling the rare case that the slot we're accessing is InProgress. + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(&tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + ))); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! { + _ = self.await_connection_cancelled() => { + return Err(GetActiveTenantError::Cancelled) + }, + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + }); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await?; + Ok(tenant_shard) + } } #[async_trait::async_trait] @@ -1771,13 +1818,13 @@ where self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout( + tenant_id, + ShardSelector::Zero, + ACTIVE_TENANT_TIMEOUT, + ) + .await?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 5abda7b64e..1d8e2cf6d3 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,10 +16,9 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap}; use std::ops::Deref; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::Duration; use sysinfo::SystemExt; use tokio::fs; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use anyhow::Context; use once_cell::sync::Lazy; @@ -119,6 +118,7 @@ pub(crate) enum TenantsMapRemoveResult { /// When resolving a TenantId to a shard, we may be looking for the 0th /// shard, or we might be looking for whichever shard holds a particular page. +#[derive(Copy, Clone)] pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. @@ -169,6 +169,14 @@ impl TenantStartupMode { } } +/// Result type for looking up a TenantId to a specific shard +pub(crate) enum ShardResolveResult { + NotFound, + Found(Arc), + // Wait for this barrrier, then query again + InProgress(utils::completion::Barrier), +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, @@ -182,51 +190,6 @@ impl TenantsMap { } } - /// A page service client sends a TenantId, and to look up the correct Tenant we must - /// resolve this to a fully qualified TenantShardId. - fn resolve_attached_shard( - &self, - tenant_id: &TenantId, - selector: ShardSelector, - ) -> Option { - let mut want_shard = None; - match self { - TenantsMap::Initializing => None, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { - // Ignore all slots that don't contain an attached tenant - let tenant = match &slot.1 { - TenantSlot::Attached(t) => t, - _ => continue, - }; - - match selector { - ShardSelector::First => return Some(*slot.0), - ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return Some(*slot.0) - } - ShardSelector::Page(key) => { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. - if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } - - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } - _ => continue, - } - } - - // Fall through: we didn't find an acceptable shard - None - } - } - } - /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map. /// /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded @@ -2053,6 +2016,72 @@ impl TenantManager { Ok(reparented) } + + /// A page service client sends a TenantId, and to look up the correct Tenant we must + /// resolve this to a fully qualified TenantShardId. + /// + /// During shard splits: we shall see parent shards in InProgress state and skip them, and + /// instead match on child shards which should appear in Attached state. Very early in a shard + /// split, or in other cases where a shard is InProgress, we will return our own InProgress result + /// to instruct the caller to wait for that to finish before querying again. + pub(crate) fn resolve_attached_shard( + &self, + tenant_id: &TenantId, + selector: ShardSelector, + ) -> ShardResolveResult { + let tenants = self.tenants.read().unwrap(); + let mut want_shard = None; + let mut any_in_progress = None; + + match &*tenants { + TenantsMap::Initializing => ShardResolveResult::NotFound, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + TenantSlot::InProgress(barrier) => { + // We might still find a usable shard, but in case we don't, remember that + // we saw at least one InProgress slot, so that we can distinguish this case + // from a simple NotFound in our return value. + any_in_progress = Some(barrier.clone()); + continue; + } + _ => continue, + }; + + match selector { + ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), + ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { + return ShardResolveResult::Found(tenant.clone()) + } + ShardSelector::Page(key) => { + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. + if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } + + if Some(tenant.shard_identity.number) == want_shard { + return ShardResolveResult::Found(tenant.clone()); + } + } + _ => continue, + } + } + + // Fall through: we didn't find a slot that was in Attached state & matched our selector. If + // we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise + // this requested shard simply isn't found. + if let Some(barrier) = any_in_progress { + ShardResolveResult::InProgress(barrier) + } else { + ShardResolveResult::NotFound + } + } + } + } } #[derive(Debug, thiserror::Error)] @@ -2101,105 +2130,6 @@ pub(crate) enum GetActiveTenantError { Broken(String), } -/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`] -/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`], -/// then wait for up to `timeout` (minus however long we waited for the slot). -pub(crate) async fn get_active_tenant_with_timeout( - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - cancel: &CancellationToken, -) -> Result, GetActiveTenantError> { - enum WaitFor { - Barrier(utils::completion::Barrier), - Tenant(Arc), - } - - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - let (wait_for, tenant_shard_id) = { - let locked = TENANTS.read().unwrap(); - - // Resolve TenantId to TenantShardId - let tenant_shard_id = locked - .resolve_attached_shard(&tenant_id, shard_selector) - .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - )))?; - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Active => { - // Fast path: we don't need to do any async waiting. - return Ok(tenant.clone()); - } - _ => { - tenant.activate_now(); - (WaitFor::Tenant(tenant.clone()), tenant_shard_id) - } - } - } - Some(TenantSlot::Secondary(_)) => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - Some(TenantSlot::InProgress(barrier)) => { - (WaitFor::Barrier(barrier.clone()), tenant_shard_id) - } - None => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))) - } - } - }; - - let tenant = match wait_for { - WaitFor::Barrier(barrier) => { - tracing::debug!("Waiting for tenant InProgress state to pass..."); - timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - barrier.wait(), - ) - .await - .map_err(|e| match e { - TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: wait_start.elapsed(), - }, - TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled, - })?; - { - let locked = TENANTS.read().unwrap(); - let peek_slot = - tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => tenant.clone(), - _ => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_shard_id, - ))) - } - } - } - } - WaitFor::Tenant(tenant) => tenant, - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] From 923cf91aa4c1986c47fba0158fff0ac9bf8225ce Mon Sep 17 00:00:00 2001 From: Andrew Rudenko Date: Thu, 16 May 2024 12:04:16 +0200 Subject: [PATCH 065/130] compute_ctl: catalog API endpoints (#7575) ## Problem There are two cloud's features that require extra compute endpoints. 1. We are running pg_dump to get DB schemas. Currently, we are using a special service for this. But it would be great to execute pg_dump in an isolated environment. And we already have such an environment, it's our compute! And likely enough pg_dump already exists there too! (see https://github.com/neondatabase/cloud/issues/11644#issuecomment-2084617832) 2. We need to have a way to get databases and roles from compute after time travel (see https://github.com/neondatabase/cloud/issues/12109) ## Summary of changes It adds two API endpoints to compute_ctl HTTP API that target both of the aforementioned cases. --------- Co-authored-by: Tristan Partin --- Cargo.lock | 2 + compute_tools/Cargo.toml | 2 + compute_tools/src/catalog.rs | 116 ++++++++++++++++++++ compute_tools/src/http/api.rs | 47 ++++++++ compute_tools/src/http/openapi_spec.yaml | 112 +++++++++++++++++++ compute_tools/src/lib.rs | 1 + libs/compute_api/src/responses.rs | 8 +- test_runner/fixtures/endpoint/__init__.py | 0 test_runner/fixtures/endpoint/http.py | 23 ++++ test_runner/fixtures/neon_fixtures.py | 8 ++ test_runner/regress/test_compute_catalog.py | 34 ++++++ 11 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 compute_tools/src/catalog.rs create mode 100644 test_runner/fixtures/endpoint/__init__.py create mode 100644 test_runner/fixtures/endpoint/http.py create mode 100644 test_runner/regress/test_compute_catalog.py diff --git a/Cargo.lock b/Cargo.lock index 961101b151..b1f53404ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1233,8 +1233,10 @@ dependencies = [ "serde_json", "signal-hook", "tar", + "thiserror", "tokio", "tokio-postgres", + "tokio-stream", "tokio-util", "toml_edit", "tracing", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 759a117ee9..8f96530a9d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true +tokio-stream.workspace = true tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true +thiserror.workspace = true url.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs new file mode 100644 index 0000000000..4fefa831e0 --- /dev/null +++ b/compute_tools/src/catalog.rs @@ -0,0 +1,116 @@ +use compute_api::{ + responses::CatalogObjects, + spec::{Database, Role}, +}; +use futures::Stream; +use postgres::{Client, NoTls}; +use std::{path::Path, process::Stdio, result::Result, sync::Arc}; +use tokio::{ + io::{AsyncBufReadExt, BufReader}, + process::Command, + task, +}; +use tokio_stream::{self as stream, StreamExt}; +use tokio_util::codec::{BytesCodec, FramedRead}; +use tracing::warn; + +use crate::{ + compute::ComputeNode, + pg_helpers::{get_existing_dbs, get_existing_roles}, +}; + +pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { + let connstr = compute.connstr.clone(); + task::spawn_blocking(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + let roles: Vec; + { + let mut xact = client.transaction()?; + roles = get_existing_roles(&mut xact)?; + } + let databases: Vec = get_existing_dbs(&mut client)?.values().cloned().collect(); + + Ok(CatalogObjects { roles, databases }) + }) + .await? +} + +#[derive(Debug, thiserror::Error)] +pub enum SchemaDumpError { + #[error("Database does not exist.")] + DatabaseDoesNotExist, + #[error("Failed to execute pg_dump.")] + IO(#[from] std::io::Error), +} + +// It uses the pg_dump utility to dump the schema of the specified database. +// The output is streamed back to the caller and supposed to be streamed via HTTP. +// +// Before return the result with the output, it checks that pg_dump produced any output. +// If not, it tries to parse the stderr output to determine if the database does not exist +// and special error is returned. +// +// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature. +pub async fn get_database_schema( + compute: &Arc, + dbname: &str, +) -> Result>, SchemaDumpError> { + let pgbin = &compute.pgbin; + let basepath = Path::new(pgbin).parent().unwrap(); + let pgdump = basepath.join("pg_dump"); + let mut connstr = compute.connstr.clone(); + connstr.set_path(dbname); + let mut cmd = Command::new(pgdump) + .arg("--schema-only") + .arg(connstr.as_str()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .kill_on_drop(true) + .spawn()?; + + let stdout = cmd.stdout.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.") + })?; + + let stderr = cmd.stderr.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.") + })?; + + let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new()); + let stderr_reader = BufReader::new(stderr); + + let first_chunk = match stdout_reader.next().await { + Some(Ok(bytes)) if !bytes.is_empty() => bytes, + Some(Err(e)) => { + return Err(SchemaDumpError::IO(e)); + } + _ => { + let mut lines = stderr_reader.lines(); + if let Some(line) = lines.next_line().await? { + if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) { + return Err(SchemaDumpError::DatabaseDoesNotExist); + } + warn!("pg_dump stderr: {}", line) + } + tokio::spawn(async move { + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + + return Err(SchemaDumpError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "failed to start pg_dump", + ))); + } + }; + let initial_stream = stream::once(Ok(first_chunk.freeze())); + // Consume stderr and log warnings + tokio::spawn(async move { + let mut lines = stderr_reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 128783b477..0286429cf2 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,17 +5,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::catalog::SchemaDumpError; +use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; use anyhow::Result; +use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use tokio::task; use tracing::{error, info, warn}; use tracing_utils::http::OtelName; +use utils::http::request::must_get_query_param; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { @@ -133,6 +137,34 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /dbs_and_roles GET request",); + match get_dbs_and_roles(compute).await { + Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), + Err(_) => { + render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + + (&Method::GET, "/database_schema") => { + let database = match must_get_query_param(&req, "database") { + Err(e) => return e.into_response(), + Ok(database) => database, + }; + info!("serving /database_schema GET request with database: {database}",); + match get_database_schema(compute, &database).await { + Ok(res) => render_plain(Body::wrap_stream(res)), + Err(SchemaDumpError::DatabaseDoesNotExist) => { + render_json_error("database does not exist", StatusCode::NOT_FOUND) + } + Err(e) => { + error!("can't get schema dump: {}", e); + render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { }; Response::builder() .status(status) + .header(CONTENT_TYPE, "application/json") .body(Body::from(serde_json::to_string(&error).unwrap())) .unwrap() } +fn render_json(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "application/json") + .body(body) + .unwrap() +} + +fn render_plain(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "text/plain") + .body(body) + .unwrap() +} + async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { { let mut state = compute.state.lock().unwrap(); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index d2ec54299f..b0ddaeae2b 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -68,6 +68,51 @@ paths: schema: $ref: "#/components/schemas/Info" + /dbs_and_roles: + get: + tags: + - Info + summary: Get databases and roles in the catalog. + description: "" + operationId: getDbsAndRoles + responses: + 200: + description: Compute schema objects + content: + application/json: + schema: + $ref: "#/components/schemas/DbsAndRoles" + + /database_schema: + get: + tags: + - Info + summary: Get schema dump + parameters: + - name: database + in: query + description: Database name to dump. + required: true + schema: + type: string + example: "postgres" + description: Get schema dump in SQL format. + operationId: getDatabaseSchema + responses: + 200: + description: Schema dump + content: + text/plain: + schema: + type: string + description: Schema dump in SQL format. + 404: + description: Non existing database. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -229,6 +274,73 @@ components: num_cpus: type: integer + DbsAndRoles: + type: object + description: Databases and Roles + required: + - roles + - databases + properties: + roles: + type: array + items: + $ref: "#/components/schemas/Role" + databases: + type: array + items: + $ref: "#/components/schemas/Database" + + Database: + type: object + description: Database + required: + - name + - owner + - restrict_conn + - invalid + properties: + name: + type: string + owner: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + restrict_conn: + type: boolean + invalid: + type: boolean + + Role: + type: object + description: Role + required: + - name + properties: + name: + type: string + encrypted_password: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + + GenericOption: + type: object + description: Schema Generic option + required: + - name + - vartype + properties: + name: + type: string + value: + type: string + vartype: + type: string + ComputeState: type: object required: diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index eac808385c..18c228ba54 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -8,6 +8,7 @@ pub mod configurator; pub mod http; #[macro_use] pub mod logger; +pub mod catalog; pub mod compute; pub mod extension_server; pub mod monitor; diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index fd0c90d447..d05d625b0a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::ComputeSpec; +use crate::spec::{ComputeSpec, Database, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -113,6 +113,12 @@ pub struct ComputeMetrics { pub total_ext_download_size: u64, } +#[derive(Clone, Debug, Default, Serialize)] +pub struct CatalogObjects { + pub roles: Vec, + pub databases: Vec, +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. /// This is not actually a compute API response, so consider moving /// to a different place. diff --git a/test_runner/fixtures/endpoint/__init__.py b/test_runner/fixtures/endpoint/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py new file mode 100644 index 0000000000..42f0539c19 --- /dev/null +++ b/test_runner/fixtures/endpoint/http.py @@ -0,0 +1,23 @@ +import requests +from requests.adapters import HTTPAdapter + + +class EndpointHttpClient(requests.Session): + def __init__( + self, + port: int, + ): + super().__init__() + self.port = port + + self.mount("http://", HTTPAdapter()) + + def dbs_and_roles(self): + res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res.raise_for_status() + return res.json() + + def database_schema(self, database: str): + res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") + res.raise_for_status() + return res.text diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a6fd4792dd..b4761f103b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -48,6 +48,7 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( @@ -3373,6 +3374,13 @@ class Endpoint(PgProtocol): self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf + def http_client( + self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + ) -> EndpointHttpClient: + return EndpointHttpClient( + port=self.http_port, + ) + def create( self, branch_name: str, diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py new file mode 100644 index 0000000000..dd36190fcd --- /dev/null +++ b/test_runner/regress/test_compute_catalog.py @@ -0,0 +1,34 @@ +import requests +from fixtures.neon_fixtures import NeonEnv + + +def test_compute_catalog(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") + + endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + client = endpoint.http_client() + + objects = client.dbs_and_roles() + + # Assert that 'cloud_admin' role exists in the 'roles' list + assert any( + role["name"] == "cloud_admin" for role in objects["roles"] + ), "The 'cloud_admin' role is missing" + + # Assert that 'postgres' database exists in the 'databases' list + assert any( + db["name"] == "postgres" for db in objects["databases"] + ), "The 'postgres' database is missing" + + ddl = client.database_schema(database="postgres") + + assert "-- PostgreSQL database dump" in ddl + + try: + client.database_schema(database="nonexistentdb") + raise AssertionError("Expected HTTPError was not raised") + except requests.exceptions.HTTPError as e: + assert ( + e.response.status_code == 404 + ), f"Expected 404 status code, but got {e.response.status_code}" From 790c05d67543a2f183193eff66de4e90dbe2e7f9 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 16 May 2024 12:05:50 +0100 Subject: [PATCH 066/130] proxy: swap tungstenite for a simpler impl (#7353) ## Problem I wanted to do a deep dive of the tungstenite codebase. tokio-tungstenite is incredibly convoluted... In my searching I found [fastwebsockets by deno](https://github.com/denoland/fastwebsockets), but it wasn't quite sufficient. This also removes the default 16MB/64MB frame/message size limitation. framed-websockets solves this by inserting continuation frames for partially received messages, so the whole message does not need to be entirely read into memory. ## Summary of changes I took the fastwebsockets code as a starting off point and rewrote it to be simpler, server-only, and be poll-based to support our Read/Write wrappers. I have replaced our tungstenite code with my framed-websockets fork. --- Cargo.lock | 109 ++++++++----------- Cargo.toml | 7 +- proxy/Cargo.toml | 4 +- proxy/src/serverless.rs | 9 +- proxy/src/serverless/websocket.rs | 93 ++++++++-------- test_runner/regress/test_proxy_websockets.py | 9 +- 6 files changed, 107 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b1f53404ea..e1edd53fea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -708,7 +708,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite 0.20.0", + "tokio-tungstenite", "tower", "tower-layer", "tower-service", @@ -979,6 +979,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytemuck" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" + [[package]] name = "byteorder" version = "1.4.3" @@ -1598,7 +1604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" dependencies = [ "cfg-if", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core 0.9.8", @@ -1999,6 +2005,27 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "framed-websockets" +version = "0.1.0" +source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" +dependencies = [ + "base64 0.21.1", + "bytemuck", + "bytes", + "futures-core", + "futures-sink", + "http-body-util", + "hyper 1.2.0", + "hyper-util", + "pin-project", + "rand 0.8.5", + "sha1", + "thiserror", + "tokio", + "tokio-util", +] + [[package]] name = "fs2" version = "0.4.3" @@ -2277,9 +2304,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", @@ -2287,11 +2314,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.4" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" dependencies = [ - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -2600,21 +2627,6 @@ dependencies = [ "tokio-native-tls", ] -[[package]] -name = "hyper-tungstenite" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" -dependencies = [ - "http-body-util", - "hyper 1.2.0", - "hyper-util", - "pin-project-lite", - "tokio", - "tokio-tungstenite 0.21.0", - "tungstenite 0.21.0", -] - [[package]] name = "hyper-util" version = "0.1.3" @@ -2692,7 +2704,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -2954,7 +2966,7 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" dependencies = [ - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -3007,7 +3019,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", "crossbeam-utils", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "itoa", "lasso", "measured-derive", @@ -3569,7 +3581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -3896,7 +3908,7 @@ dependencies = [ "ahash", "bytes", "chrono", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "num", "num-bigint", "paste", @@ -4380,9 +4392,10 @@ dependencies = [ "dashmap", "env_logger", "fallible-iterator", + "framed-websockets", "futures", "git-version", - "hashbrown 0.13.2", + "hashbrown 0.14.5", "hashlink", "hex", "hmac", @@ -4392,7 +4405,6 @@ dependencies = [ "humantime", "hyper 0.14.26", "hyper 1.2.0", - "hyper-tungstenite", "hyper-util", "indexmap 2.0.1", "ipnet", @@ -4437,7 +4449,6 @@ dependencies = [ "smol_str", "socket2 0.5.5", "subtle", - "sync_wrapper", "task-local-extensions", "thiserror", "tikv-jemalloc-ctl", @@ -4446,6 +4457,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-tungstenite", "tokio-util", "tower-service", "tracing", @@ -6382,19 +6394,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.20.1", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" -dependencies = [ - "futures-util", - "log", - "tokio", - "tungstenite 0.21.0", + "tungstenite", ] [[package]] @@ -6408,7 +6408,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "pin-project-lite", "tokio", "tracing", @@ -6690,25 +6690,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "tungstenite" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" -dependencies = [ - "byteorder", - "bytes", - "data-encoding", - "http 1.1.0", - "httparse", - "log", - "rand 0.8.5", - "sha1", - "thiserror", - "url", - "utf-8", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -7504,7 +7485,7 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "hex", "hmac", "hyper 0.14.26", diff --git a/Cargo.toml b/Cargo.toml index 3ccdabee18..b59a5dcd6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,13 +81,14 @@ enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" +framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" -hashbrown = "0.13" -hashlink = "0.8.4" +hashbrown = "0.14" +hashlink = "0.9.1" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" @@ -98,7 +99,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.13.0" +tokio-tungstenite = "0.20.0" indexmap = "2" inotify = "0.10.2" ipnet = "2.9.0" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3002006aed..5f9b0aa75b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -26,6 +26,7 @@ clap.workspace = true consumption_metrics.workspace = true dashmap.workspace = true env_logger.workspace = true +framed-websockets.workspace = true futures.workspace = true git-version.workspace = true hashbrown.workspace = true @@ -35,7 +36,6 @@ hmac.workspace = true hostname.workspace = true http.workspace = true humantime.workspace = true -hyper-tungstenite.workspace = true hyper.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } @@ -76,7 +76,6 @@ smol_str.workspace = true smallvec.workspace = true socket2.workspace = true subtle.workspace = true -sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true @@ -106,6 +105,7 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true fallible-iterator.workspace = true +tokio-tungstenite.workspace = true rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index f634ab4e98..24ee749e6e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -102,7 +102,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` - let server = Builder::new(hyper_util::rt::TokioExecutor::new()); + let server = Builder::new(TokioExecutor::new()); while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = res.context("could not accept TCP stream")?; @@ -255,7 +255,6 @@ async fn connection_handler( .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), ); - async move { let res = handler.await; cancel_request.disarm(); @@ -301,7 +300,7 @@ async fn request_handler( .map(|s| s.to_string()); // Check if the request is a websocket upgrade request. - if hyper_tungstenite::is_upgrade_request(&request) { + if framed_websockets::upgrade::is_upgrade_request(&request) { let ctx = RequestMonitoring::new( session_id, peer_addr, @@ -312,7 +311,7 @@ async fn request_handler( let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); - let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) + let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; ws_connections.spawn( @@ -334,7 +333,7 @@ async fn request_handler( ); // Return the response so the spawned future can continue. - Ok(response) + Ok(response.map(|_: http_body_util::Empty| Full::new(Bytes::new()))) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new( session_id, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 649bec2c7c..61d6d60dbe 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -7,10 +7,11 @@ use crate::{ proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; -use bytes::{Buf, Bytes}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; -use hyper::upgrade::Upgraded; -use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; +use hyper1::upgrade::OnUpgrade; +use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; use std::{ @@ -21,25 +22,23 @@ use std::{ use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -// TODO: use `std::sync::Exclusive` once it's stabilized. -// Tracking issue: https://github.com/rust-lang/rust/issues/98407. -use sync_wrapper::SyncWrapper; - pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. - pub struct WebSocketRw { + pub struct WebSocketRw { #[pin] - stream: SyncWrapper>, - bytes: Bytes, + stream: WebSocketServer, + recv: Bytes, + send: BytesMut, } } impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { + pub fn new(stream: WebSocketServer) -> Self { Self { - stream: stream.into(), - bytes: Bytes::new(), + stream, + recv: Bytes::new(), + send: BytesMut::new(), } } } @@ -50,22 +49,24 @@ impl AsyncWrite for WebSocketRw { cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { - let mut stream = self.project().stream.get_pin_mut(); + let this = self.project(); + let mut stream = this.stream; + this.send.put(buf); ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; - match stream.as_mut().start_send(Message::Binary(buf.into())) { + match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), Err(e) => Poll::Ready(Err(io_error(e))), } } fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_flush(cx).map_err(io_error) } fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_close(cx).map_err(io_error) } } @@ -76,13 +77,10 @@ impl AsyncRead for WebSocketRw { cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - if buf.remaining() > 0 { - let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; - let len = std::cmp::min(bytes.len(), buf.remaining()); - buf.put_slice(&bytes[..len]); - self.consume(len); - } - + let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; + let len = std::cmp::min(bytes.len(), buf.remaining()); + buf.put_slice(&bytes[..len]); + self.consume(len); Poll::Ready(Ok(())) } } @@ -94,31 +92,27 @@ impl AsyncBufRead for WebSocketRw { let mut this = self.project(); loop { - if !this.bytes.chunk().is_empty() { - let chunk = (*this.bytes).chunk(); + if !this.recv.chunk().is_empty() { + let chunk = (*this.recv).chunk(); return Poll::Ready(Ok(chunk)); } - let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx)); + let res = ready!(this.stream.as_mut().poll_next(cx)); match res.transpose().map_err(io_error)? { - Some(message) => match message { - Message::Ping(_) => {} - Message::Pong(_) => {} - Message::Text(text) => { + Some(message) => match message.opcode { + OpCode::Ping => {} + OpCode::Pong => {} + OpCode::Text => { // We expect to see only binary messages. let error = "unexpected text message in the websocket"; - warn!(length = text.len(), error); + warn!(length = message.payload.len(), error); return Poll::Ready(Err(io_error(error))); } - Message::Frame(_) => { - // This case is impossible according to Frame's doc. - panic!("unexpected raw frame in the websocket"); + OpCode::Binary | OpCode::Continuation => { + debug_assert!(this.recv.is_empty()); + *this.recv = message.payload.freeze(); } - Message::Binary(chunk) => { - assert!(this.bytes.is_empty()); - *this.bytes = Bytes::from(chunk); - } - Message::Close(_) => return EOF, + OpCode::Close => return EOF, }, None => return EOF, } @@ -126,19 +120,21 @@ impl AsyncBufRead for WebSocketRw { } fn consume(self: Pin<&mut Self>, amount: usize) { - self.project().bytes.advance(amount); + self.project().recv.advance(amount); } } pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - websocket: HyperWebsocket, + websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; + let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); + let conn_gauge = Metrics::get() .proxy .client_connections @@ -177,15 +173,16 @@ pub async fn serve_websocket( mod tests { use std::pin::pin; + use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use hyper_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; use tokio::{ io::{duplex, AsyncReadExt, AsyncWriteExt}, task::JoinSet, }; + use tokio_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; use super::WebSocketRw; @@ -210,9 +207,7 @@ mod tests { }); js.spawn(async move { - let mut rw = pin!(WebSocketRw::new( - WebSocketStream::from_raw_socket(stream2, Role::Server, None).await - )); + let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2))); let mut buf = vec![0; 1024]; let n = rw.read(&mut buf).await.unwrap(); diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index 6d1cb9765a..6211446a40 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -135,7 +135,14 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): query_message = "SELECT 1".encode("utf-8") + b"\0" length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") await websocket.send( - [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message] + length0 + + startup_message + + b"p" + + length1 + + auth_message + + b"Q" + + length2 + + query_message ) startup_response = await websocket.recv() From ec069dc45ec6c0ef9b500dc0f433a0415b7e26db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 May 2024 16:48:49 +0200 Subject: [PATCH 067/130] tiered compaction: introduce PAGE_SZ constant and use it (#7785) pointed out by @problame : we use the literal 8192 instead of a properly defined constant. replace the literal by a PAGE_SZ constant. --- pageserver/compaction/src/bin/compaction-simulator.rs | 3 ++- pageserver/compaction/src/compact_tiered.rs | 6 +++--- pageserver/compaction/src/helpers.rs | 2 ++ pageserver/compaction/src/simulator.rs | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs index 1fd69407d3..c308694ae1 100644 --- a/pageserver/compaction/src/bin/compaction-simulator.rs +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -1,4 +1,5 @@ use clap::{Parser, Subcommand}; +use pageserver_compaction::helpers::PAGE_SZ; use pageserver_compaction::simulator::MockTimeline; use rand::Rng; use std::io::Write; @@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> let mut executor = MockTimeline::new(); // Convert the logical size in MB into a key range. - let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192); + let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ); //let key_range = u64::MIN..u64::MAX; println!( "starting simulation with key range {:016X}-{:016X}", diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index a8f184af24..33c9948f45 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque}; use std::ops::Range; use crate::helpers::{ - accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ, }; use crate::interface::*; use utils::lsn::Lsn; @@ -379,7 +379,7 @@ where .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, &self.shard_identity, - ) * 8192; + ) * PAGE_SZ; let wal_size = job .input_layers @@ -441,7 +441,7 @@ where let mut window = KeyspaceWindow::new( E::Key::MIN..E::Key::MAX, keyspace, - self.target_file_size / 8192, + self.target_file_size / PAGE_SZ, ); while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 2c922b0a49..8ed1d16082 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -16,6 +16,8 @@ use std::pin::Pin; use std::task::{ready, Poll}; use utils::lsn::Lsn; +pub const PAGE_SZ: u64 = 8192; + pub fn keyspace_total_size( keyspace: &CompactionKeySpace, shard_identity: &ShardIdentity, diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 3543df64fa..a7c8bd5c1f 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -14,6 +14,7 @@ use std::ops::Range; use std::sync::Arc; use std::sync::Mutex; +use crate::helpers::PAGE_SZ; use crate::helpers::{merge_delta_keys, overlaps_with}; use crate::interface; @@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline { let new_layer = Arc::new(MockImageLayer { key_range: key_range.clone(), lsn_range: lsn..lsn, - file_size: accum_size * 8192, + file_size: accum_size * PAGE_SZ, deleted: Mutex::new(false), }); info!( From 4c5afb7b1000768b4ec6dd3db362c1159a189aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 May 2024 19:35:13 +0200 Subject: [PATCH 068/130] Remove SSO_ACCOUNT_ID from scrubber docs and BucketConfig (#7774) As of #6202 we support `AWS_PROFILE` as well, which is more convenient. Change the docs to using it instead of `SSO_ACCOUNT_ID`. Also, remove `SSO_ACCOUNT_ID` from BucketConfig as it is confusing to the code's reader: it's not the "main" way of setting up authentication for the scrubber any more. It is a breaking change for the on-disk format as we persist `sso_account_id` to disk, but it was quite inconsistent with the other methods which are not persistet. Also, I don't think we want to support the case where one version writes the json and another version reads it. Related: #7667 --- s3_scrubber/README.md | 12 +++++++----- s3_scrubber/src/lib.rs | 24 ++++-------------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index c1deab8852..8a96542ada 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants #### S3 -Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help). +Do `aws sso login --profile dev` to get the SSO access to the bucket to clean. +Also, set the following environment variables: -- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets +- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`) - `REGION`: A region where the bucket is located at. - `BUCKET`: Bucket name +- `BUCKET_PREFIX` (optional): Prefix inside the bucket #### Console API @@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand. Example: -`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` #### `purge-garbage` @@ -59,7 +61,7 @@ to pass them on the command line Example: -`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents. Without the `--delete` flag @@ -72,7 +74,7 @@ Errors are logged to stderr and summary to stdout. For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver +env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 7966fb6a88..e0f99ecd9c 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -200,30 +200,15 @@ impl RootTarget { } #[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct BucketConfig { pub region: String, pub bucket: String, pub prefix_in_bucket: Option, - - /// Use SSO if this is set, else rely on AWS_* environment vars - pub sso_account_id: Option, -} - -impl Display for BucketConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.sso_account_id.as_deref().unwrap_or(""), - self.region, - self.bucket - ) - } } impl BucketConfig { pub fn from_env() -> anyhow::Result { - let sso_account_id = env::var("SSO_ACCOUNT_ID").ok(); let region = env::var("REGION").context("'REGION' param retrieval")?; let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); @@ -232,7 +217,6 @@ impl BucketConfig { region, bucket, prefix_in_bucket, - sso_account_id, }) } } @@ -276,7 +260,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { guard } -pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Client { +pub fn init_s3_client(bucket_region: Region) -> Client { let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" let chain = CredentialsProviderChain::first_try( @@ -290,7 +274,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ); // Use SSO if we were given an account ID - match account_id { + match std::env::var("SSO_ACCOUNT_ID").ok() { Some(sso_account) => chain.or_else( "sso", SsoCredentialsProvider::builder() @@ -334,7 +318,7 @@ fn init_remote( ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region)); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { From 4b8809b280b04c92d0c9e2cfb21cbc230de7995b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 May 2024 22:25:19 +0200 Subject: [PATCH 069/130] Tiered compaction: improvements to the windows (#7787) Tiered compaction employs two sliding windows over the keyspace: `KeyspaceWindow` for the image layer generation and `Window` for the delta layer generation. Do some fixes to both windows: * The distinction between the two windows is not very clear. Do the absolute minimum to mention where they are used in the rustdoc description of the struct. Maybe we should rename them (say `WindowForImage` and `WindowForDelta`) or merge them into one window implementation. * Require the keys to strictly increase. The `accum_key_values` already combines the key, so there is no logic needed in `Window::feed` for the same key repeating. This is a follow-up to address the request in https://github.com/neondatabase/neon/pull/7671#pullrequestreview-2051995541 * In `choose_next_delta`, we claimed in the comment to use 1.25 as the factor but it was 1.66 instead. Fix this discrepancy by using `*5/4` as the two operations. --- pageserver/compaction/src/compact_tiered.rs | 24 +++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 33c9948f45..20f88868f9 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -663,8 +663,8 @@ where } } -// Sliding window through keyspace and values -// This is used by over_with_images to decide on good split points +/// Sliding window through keyspace and values for image layer +/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points struct KeyspaceWindow { head: KeyspaceWindowHead, @@ -804,9 +804,9 @@ struct WindowElement { accum_size: u64, } -// Sliding window through keyspace and values -// -// This is used to decide what layer to write next, from the beginning of the window. +/// Sliding window through keyspace and values for delta layer tiling +/// +/// This is used to decide which delta layer to write next. struct Window { elems: VecDeque>, @@ -830,11 +830,13 @@ where fn feed(&mut self, key: K, size: u64) { let last_size; if let Some(last) = self.elems.back_mut() { - assert!(last.last_key <= key); - if key == last.last_key { - last.accum_size += size; - return; - } + // We require the keys to be strictly increasing for the window. + // Keys should already have been deduplicated by `accum_key_values` + assert!( + last.last_key < key, + "last_key(={}) >= key(={key})", + last.last_key + ); last_size = last.accum_size; } else { last_size = 0; @@ -922,7 +924,7 @@ where // If we're willing to stretch it up to 1.25 target size, could we // gobble up the rest of the work? This avoids creating very small // "tail" layers at the end of the keyspace - if !has_more && self.remain_size() < target_size * 5 / 3 { + if !has_more && self.remain_size() < target_size * 5 / 4 { self.commit_upto(self.elems.len()); } else { let delta_split_at = self.find_size_split(target_size); From 6d951e69d636dc1fbfda2bc0282547fcae19ec81 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 17 May 2024 12:24:02 +0200 Subject: [PATCH 070/130] test_suite: patch, don't replace, the `tenant_config` field, where appropriate (#7771) Before this PR, the changed tests would overwrite the entire `tenant_config` because `pageserver_config_override` is merged non-recursively into the `ps_cfg`. This meant they would override the `PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our matrix build for `compaction_algorithm=Tiered|Legacy` in https://github.com/neondatabase/neon/pull/7748. I found the tests fixed in this PR using the `NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that I added in #7748. Therefore, I think this is an exhaustive fix. This is better than just searching the code base for `tenant_config`, which is what I had sketched in #7747. refs #7749 --- test_runner/fixtures/neon_fixtures.py | 14 +++++++---- test_runner/regress/test_branch_behind.py | 3 +-- test_runner/regress/test_gc_aggressive.py | 9 +++---- test_runner/regress/test_old_request_lsn.py | 3 +-- test_runner/regress/test_pageserver_api.py | 2 ++ test_runner/regress/test_pitr_gc.py | 6 ++--- test_runner/regress/test_recovery.py | 8 +++--- test_runner/regress/test_tenant_conf.py | 27 +++++++++++++-------- test_runner/regress/test_tenant_size.py | 11 ++++++--- test_runner/regress/test_timeline_size.py | 20 +++++++++------ test_runner/regress/test_wal_receiver.py | 13 +++++++--- 11 files changed, 70 insertions(+), 46 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b4761f103b..23f30804b4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -455,7 +455,7 @@ class NeonEnvBuilder: test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` - pageserver_config_override: Optional[str] = None, + pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None, num_safekeepers: int = 1, num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs @@ -1127,10 +1127,14 @@ class NeonEnv: ) if config.pageserver_config_override is not None: - for o in config.pageserver_config_override.split(";"): - override = toml.loads(o) - for key, value in override.items(): - ps_cfg[key] = value + if callable(config.pageserver_config_override): + config.pageserver_config_override(ps_cfg) + else: + assert isinstance(config.pageserver_config_override, str) + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_cfg[key] = value # Create a corresponding NeonPageserver object self.pageservers.append( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index ac2fc79be4..0a5336f5a2 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar # def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) error_regexes = [ ".*invalid branch start lsn.*", diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index e5067bba8b..44133f2350 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): # def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") endpoint = env.endpoints.create_start("test_gc_aggressive") @@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): - # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" num_index_uploads = 0 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start() + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main") endpoint = env.endpoints.create_start("test_gc_index_upload") diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 43b0bb56f0..f1dd3fb67d 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar # def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) env.neon_cli.create_branch("test_old_request_lsn", "main") endpoint = env.endpoints.create_start("test_old_request_lsn") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 80a1d72f4a..abbea59113 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -42,6 +42,8 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): "listen_http_addr", "pg_auth_type", "http_auth_type", + # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 + # "tenant_config", ] required_config_overrides = [ f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 6434f431a4..7e676b5515 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar # def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Set pitr interval such that we need to keep the data - neon_env_builder.pageserver_config_override = ( - "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + env = neon_env_builder.init_start( + initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"} ) - - env = neon_env_builder.init_start() endpoint_main = env.endpoints.create_start("main") main_pg_conn = endpoint_main.connect() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index ab5c8be256..e21f9bb6f6 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Override default checkpointer settings to run it more often - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "1048576", + } + ) env.pageserver.is_testing_enabled_or_skip() # We expect the pageserver to exit, which will cause storage storage controller diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index a345464208..2cbb036c0d 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,5 +1,6 @@ import json from contextlib import closing +from typing import Any, Dict import psycopg2.extras from fixtures.common_types import Lsn @@ -14,16 +15,22 @@ from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): """Test per tenant configuration""" - # set some non-default global config - neon_env_builder.pageserver_config_override = """ -page_cache_size=444; -wait_lsn_timeout='111 s'; -[tenant_config] -checkpoint_distance = 10000 -compaction_target_size = 1048576 -evictions_low_residence_duration_metric_threshold = "2 days" -eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" } -""" + + def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): + ps_cfg["page_cache_size"] = 444 + ps_cfg["wait_lsn_timeout"] = "111 s" + + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["checkpoint_distance"] = 10000 + tenant_config["compaction_target_size"] = 1048576 + tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days" + tenant_config["eviction_policy"] = { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23 hours", + } + + neon_env_builder.pageserver_config_override = set_some_nondefault_global_config env = neon_env_builder.init_start() # we configure eviction but no remote storage, there might be error lines diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 7894f6933d..d3a228dbeb 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches( gc_horizon = 128 * 1024 - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "0sec", + "gc_horizon": gc_horizon, + } + ) # FIXME: we have a race condition between GC and delete timeline. GC might fail with this # error. Similar to https://github.com/neondatabase/neon/issues/2671 diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 18063bf104..a6d06df3b6 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = ( - "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "100000", + "compaction_period": "10m", + } ) - - env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") @@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "100000", + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "1s", + } + ) pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index d9265dcbcd..6582b34218 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,4 +1,5 @@ import time +from typing import Any, Dict from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log @@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): # Kills one of the safekeepers and ensures that only the active ones are printed in the state. def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): # Trigger WAL wait timeout faster - neon_env_builder.pageserver_config_override = """ - wait_lsn_timeout = "1s" - tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"} - """ + def customize_pageserver_toml(ps_cfg: Dict[str, Any]): + ps_cfg["wait_lsn_timeout"] = "1s" + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["walreceiver_connect_timeout"] = "2s" + tenant_config["lagging_wal_timeout"] = "2s" + + neon_env_builder.pageserver_config_override = customize_pageserver_toml + # Have notable SK ids to ensure we check logs for their presence, not some other random numbers neon_env_builder.safekeepers_id_start = 12345 neon_env_builder.num_safekeepers = 3 From c1390bfc3bbd3a7f00df334a39220ca312fc888e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 17 May 2024 13:25:01 +0300 Subject: [PATCH 071/130] chore: update defaults for timeline_detach_ancestor (#7779) by having 100 copy operations in flight twe climb up to 2500 requests per min or 41/s. This is still probably less than is allowed, but fast enough for our purposes. --- pageserver/src/tenant/timeline/detach_ancestor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 7f59758c87..4d8e570181 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -56,7 +56,7 @@ impl Default for Options { fn default() -> Self { Self { rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), - copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(), + copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(), } } } From a8e6d259cb49d1bf156dfc2215b92c04d1e8a08f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 17 May 2024 13:24:03 +0100 Subject: [PATCH 072/130] pageserver: fixes for layer path changes (#7786) ## Problem - When a layer with legacy local path format is evicted and then re-downloaded, a panic happened because the path downloaded by remote storage didn't match the path stored in Layer. - While investigating, I also realized that secondary locations would have a similar issue with evictions. Closes: #7783 ## Summary of changes - Make remote timeline client take local paths as an input: it should not have its own ideas about local paths, instead it just uses the layer path that the Layer has. - Make secondary state store an explicit local path, populated on scan of local disk at startup. This provides the same behavior as for Layer, that our local_layer_path is a _default_, but the layer path can actually be anything (e.g. an old style one). - Add tests for both cases. --- pageserver/src/disk_usage_eviction_task.rs | 8 +- .../src/tenant/remote_timeline_client.rs | 2 + .../tenant/remote_timeline_client/download.rs | 11 +-- pageserver/src/tenant/secondary.rs | 54 +++--------- pageserver/src/tenant/secondary/downloader.rs | 47 ++++++++--- pageserver/src/tenant/storage_layer/layer.rs | 1 + .../regress/test_pageserver_generations.py | 84 +++++++++++++++---- 7 files changed, 119 insertions(+), 88 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index ebeb8bbb20..7f25e49570 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( } EvictionLayer::Secondary(layer) => { let file_size = layer.metadata.file_size(); - let tenant_manager = tenant_manager.clone(); js.spawn(async move { layer .secondary_tenant - .evict_layer( - tenant_manager.get_conf(), - layer.timeline_id, - layer.name, - layer.metadata, - ) + .evict_layer(layer.timeline_id, layer.name) .await; Ok(file_size) }); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c5462dac43..07d6af696c 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -518,6 +518,7 @@ impl RemoteTimelineClient { &self, layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { @@ -536,6 +537,7 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + local_path, cancel, ctx, ) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index f3c9e64533..70c5cae05e 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -21,7 +21,6 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; -use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; @@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>( timeline_id: TimelineId, layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let local_path = local_layer_path( - conf, - &tenant_shard_id, - &timeline_id, - layer_file_name, - &layer_metadata.generation, - ); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>( // For more context about durable_rename check this email from postgres mailing list: // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. - let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 7075044baf..252b6eb11b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -6,11 +6,9 @@ mod scheduler; use std::{sync::Arc, time::SystemTime}; use crate::{ - config::PageServerConf, context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - virtual_file::MaybeFatalIo, }; use self::{ @@ -21,9 +19,8 @@ use self::{ use super::{ config::{SecondaryLocationConfig, TenantConfOpt}, mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName}, + storage_layer::LayerName, }; use pageserver_api::{ @@ -178,13 +175,7 @@ impl SecondaryTenant { /// Cancellation safe, but on cancellation the eviction will go through #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] - pub(crate) async fn evict_layer( - self: &Arc, - conf: &PageServerConf, - timeline_id: TimelineId, - name: LayerName, - metadata: LayerFileMetadata, - ) { + pub(crate) async fn evict_layer(self: &Arc, timeline_id: TimelineId, name: LayerName) { debug_assert_current_span_has_tenant_id(); let guard = match self.gate.enter() { @@ -197,41 +188,11 @@ impl SecondaryTenant { let now = SystemTime::now(); - let local_path = local_layer_path( - conf, - &self.tenant_shard_id, - &timeline_id, - &name, - &metadata.generation, - ); - let this = self.clone(); // spawn it to be cancellation safe tokio::task::spawn_blocking(move || { let _guard = guard; - // We tolerate ENOENT, because between planning eviction and executing - // it, the secondary downloader could have seen an updated heatmap that - // resulted in a layer being deleted. - // Other local I/O errors are process-fatal: these should never happen. - let deleted = std::fs::remove_file(local_path); - - let not_found = deleted - .as_ref() - .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound); - - let deleted = if not_found { - false - } else { - deleted - .map(|()| true) - .fatal_err("Deleting layer during eviction") - }; - - if !deleted { - // skip updating accounting and putting perhaps later timestamp - return; - } // Update the timeline's state. This does not have to be synchronized with // the download process, because: @@ -250,8 +211,15 @@ impl SecondaryTenant { // of the cache. let mut detail = this.detail.lock().unwrap(); if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - timeline_detail.on_disk_layers.remove(&name); - timeline_detail.evicted_at.insert(name, now); + let removed = timeline_detail.on_disk_layers.remove(&name); + + // We might race with removal of the same layer during downloads, if it was removed + // from the heatmap. If we see that the OnDiskState is gone, then no need to + // do a physical deletion or store in evicted_at. + if let Some(removed) = removed { + removed.remove_blocking(); + timeline_detail.evicted_at.insert(name, now); + } } }) .await diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 46a3d7e81f..8f27220771 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -111,6 +111,7 @@ struct SecondaryDownloader { pub(super) struct OnDiskState { metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, } impl OnDiskState { @@ -121,12 +122,26 @@ impl OnDiskState { _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, ) -> Self { Self { metadata, access_time, + local_path, } } + + // This is infallible, because all errors are either acceptable (ENOENT), or totally + // unexpected (fatal). + pub(super) fn remove_blocking(&self) { + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. + std::fs::remove_file(&self.local_path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Deleting secondary layer") + } } #[derive(Debug, Clone, Default)] @@ -816,20 +831,12 @@ impl<'a> TenantDownloader<'a> { if cfg!(debug_assertions) { // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think // are already present on disk are really there. - let local_path = local_layer_path( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - &layer.name, - &layer.metadata.generation, - ); - - match tokio::fs::metadata(&local_path).await { + match tokio::fs::metadata(&on_disk.local_path).await { Ok(meta) => { tracing::debug!( "Layer {} present at {}, size {}", layer.name, - local_path, + on_disk.local_path, meta.len(), ); } @@ -837,7 +844,7 @@ impl<'a> TenantDownloader<'a> { tracing::warn!( "Layer {} not found at {} ({})", layer.name, - local_path, + on_disk.local_path, e ); debug_assert!(false); @@ -926,6 +933,13 @@ impl<'a> TenantDownloader<'a> { v.get_mut().access_time = t.access_time; } Entry::Vacant(e) => { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &t.name, + &t.metadata.generation, + ); e.insert(OnDiskState::new( self.conf, tenant_shard_id, @@ -933,6 +947,7 @@ impl<'a> TenantDownloader<'a> { t.name, LayerFileMetadata::from(&t.metadata), t.access_time, + local_path, )); } } @@ -955,6 +970,14 @@ impl<'a> TenantDownloader<'a> { &self.secondary_state.cancel ); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally let downloaded_bytes = match download_layer_file( self.conf, @@ -963,6 +986,7 @@ impl<'a> TenantDownloader<'a> { *timeline_id, &layer.name, &LayerFileMetadata::from(&layer.metadata), + &local_path, &self.secondary_state.cancel, ctx, ) @@ -1116,6 +1140,7 @@ async fn init_timeline_state( name, LayerFileMetadata::from(&remote_meta.metadata), remote_meta.access_time, + file_path, ), ); } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index e8c712c4c6..97b349f635 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1108,6 +1108,7 @@ impl LayerInner { .download_layer_file( &self.desc.layer_name(), &self.metadata(), + &self.path, &timeline.cancel, ctx, ) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 9b97254410..0235cf6d20 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -25,6 +25,7 @@ from fixtures.neon_fixtures import ( S3Scrubber, generate_uploads_and_deletions, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, @@ -632,39 +633,86 @@ def test_upgrade_generationless_local_file_paths( generation numbers: it should accept these layer files, and avoid doing a delete/download cycle on them. """ - env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' + ) workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(1000) - env.pageserver.stop() + attached_pageserver = env.get_tenant_pageserver(tenant_id) + secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[ + 0 + ] + + attached_pageserver.http_client().tenant_heatmap_upload(tenant_id) + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) # Rename the local paths to legacy format, to simulate what - # we would see when upgrading - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) - files_renamed = 0 - for filename in os.listdir(timeline_dir): - path = os.path.join(timeline_dir, filename) - log.info(f"Found file {path}") - if path.endswith("-v1-00000001"): - new_path = path[:-12] - os.rename(path, new_path) - log.info(f"Renamed {path} -> {new_path}") - files_renamed += 1 + # we would see when upgrading. Do this on both attached and secondary locations, as we will + # test the behavior of both. + for pageserver in env.pageservers: + pageserver.stop() + timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) + files_renamed = 0 + for filename in os.listdir(timeline_dir): + path = os.path.join(timeline_dir, filename) + log.info(f"Found file {path}") + if path.endswith("-v1-00000001"): + new_path = path[:-12] + os.rename(path, new_path) + log.info(f"Renamed {path} -> {new_path}") + files_renamed += 1 - assert files_renamed > 0 + assert files_renamed > 0 - env.pageserver.start() + pageserver.start() workload.validate() # Assert that there were no on-demand downloads assert ( - env.pageserver.http_client().get_metric_value( + attached_pageserver.http_client().get_metric_value( "pageserver_remote_ondemand_downloaded_layers_total" ) == 0 ) + + # Do a secondary download and ensure there were no layer downloads + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) + assert ( + secondary_pageserver.http_client().get_metric_value( + "pageserver_secondary_download_layer_total" + ) + == 0 + ) + + # Check that when we evict and promote one of the legacy-named layers, everything works as + # expected + local_layers = list( + ( + parse_layer_file_name(path.name), + os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path), + ) + for path in attached_pageserver.list_layers(tenant_id, timeline_id) + ) + (victim_layer_name, victim_path) = local_layers[0] + assert os.path.exists(victim_path) + + attached_pageserver.http_client().evict_layer( + tenant_id, timeline_id, victim_layer_name.to_str() + ) + assert not os.path.exists(victim_path) + + attached_pageserver.http_client().download_layer( + tenant_id, timeline_id, victim_layer_name.to_str() + ) + # We should download into the same local path we started with + assert os.path.exists(victim_path) From af99c959ef460326b35716239c09b3a572c43b4c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 17 May 2024 16:44:33 +0100 Subject: [PATCH 073/130] storage controller: use SERIALIZABLE isolation level (#7792) ## Problem The storage controller generally assumes that things like updating generation numbers are atomic: it should use a strict isolation level. ## Summary of changes - Wrap all database operations in a SERIALIZABLE transaction. - Retry serialization failures, as these do not indicate problems and are normal when plenty of concurrent work is happening. Using this isolation level for all reads is overkill, but much simpler than reasoning about it on a per-operation basis, and does not hurt performance. Tested this with a modified version of storage_controller_many_tenants test with 128k shards, to check that our performance is still fine: it is. --- storage_controller/src/persistence.rs | 230 ++++++++++++++------------ 1 file changed, 126 insertions(+), 104 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index dca37166ba..67c05296d5 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -173,7 +173,7 @@ impl Persistence { /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where - F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let latency = &METRICS_REGISTRY @@ -199,13 +199,48 @@ impl Persistence { /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where - F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. + const MAX_RETRIES: usize = 128; + let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { func(&mut conn) }) - .await - .expect("Task panic") + tokio::task::spawn_blocking(move || -> DatabaseResult { + let mut retry_count = 0; + loop { + match conn.build_transaction().serializable().run(|c| func(c)) { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. increment_generation for unrelated shards can collide) + tracing::debug!( + "Retrying transaction on serialization failure {err:?}" + ); + continue; + } + } + Err(e) => break Err(e), + } + } + }) + .await + .expect("Task panic") } /// When a node is first registered, persist it before using it for anything @@ -358,14 +393,11 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::InsertTenantShards, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } - Ok(()) - })?; + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } Ok(()) }, ) @@ -533,8 +565,11 @@ impl Persistence { let update = ShardUpdate { generation: input_generation.map(|g| g.into().unwrap() as i32), placement_policy: input_placement_policy + .as_ref() .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config.map(|c| serde_json::to_string(&c).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), scheduling_policy: input_scheduling_policy .map(|p| serde_json::to_string(&p).unwrap()), }; @@ -581,55 +616,51 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> DatabaseResult<()> { - // Mark parent shards as splitting + // Mark parent shards as splitting - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .set((splitting.eq(1),)) - .execute(conn)?; - if u8::try_from(updated) - .map_err(|_| DatabaseError::Logical( - format!("Overflow existing shard count {} while splitting", updated)) - )? != old_shard_count.count() { - // Perhaps a deletion or another split raced with this attempt to split, mutating - // the parent shards that we intend to split. In this case the split request should fail. - return Err(DatabaseError::Logical( - format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) - )); + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .set((splitting.eq(1),)) + .execute(conn)?; + if u8::try_from(updated) + .map_err(|_| DatabaseError::Logical( + format!("Overflow existing shard count {} while splitting", updated)) + )? != old_shard_count.count() { + // Perhaps a deletion or another split raced with this attempt to split, mutating + // the parent shards that we intend to split. In this case the split request should fail. + return Err(DatabaseError::Logical( + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) + )); + } + + // FIXME: spurious clone to sidestep closure move rules + let parent_to_children = parent_to_children.clone(); + + // Insert child shards + for (parent_shard_id, children) in parent_to_children { + let mut parent = crate::schema::tenant_shards::table + .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) + .load::(conn)?; + let parent = if parent.len() != 1 { + return Err(DatabaseError::Logical(format!( + "Parent shard {parent_shard_id} not found" + ))); + } else { + parent.pop().unwrap() + }; + for mut shard in children { + // Carry the parent's generation into the child + shard.generation = parent.generation; + + debug_assert!(shard.splitting == SplitState::Splitting); + diesel::insert_into(tenant_shards) + .values(shard) + .execute(conn)?; } - - // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); - - // Insert child shards - for (parent_shard_id, children) in parent_to_children { - let mut parent = crate::schema::tenant_shards::table - .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; - let parent = if parent.len() != 1 { - return Err(DatabaseError::Logical(format!( - "Parent shard {parent_shard_id} not found" - ))); - } else { - parent.pop().unwrap() - }; - for mut shard in children { - // Carry the parent's generation into the child - shard.generation = parent.generation; - - debug_assert!(shard.splitting == SplitState::Splitting); - diesel::insert_into(tenant_shards) - .values(shard) - .execute(conn)?; - } - } - - Ok(()) - })?; + } Ok(()) }) @@ -647,22 +678,18 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::CompleteShardSplit, move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - // Drop parent shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; - // Clear sharding flag - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .set((splitting.eq(0),)) - .execute(conn)?; - debug_assert!(updated > 0); - - Ok(()) - })?; + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); Ok(()) }, @@ -681,39 +708,34 @@ impl Persistence { self.with_measured_conn( DatabaseOperation::AbortShardSplit, move |conn| -> DatabaseResult { - let aborted = - conn.transaction(|conn| -> DatabaseResult { - // Clear the splitting state on parent shards - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.ne(new_shard_count.literal() as i32)) - .set((splitting.eq(0),)) - .execute(conn)?; + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; - // Parent shards are already gone: we cannot abort. - if updated == 0 { - return Ok(AbortShardSplitStatus::Complete); - } + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } - // Sanity check: if parent shards were present, their cardinality should - // be less than the number of child shards. - if updated >= new_shard_count.count() as usize { - return Err(DatabaseError::Logical(format!( - "Unexpected parent shard count {updated} while aborting split to \ + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ count {new_shard_count:?} on tenant {split_tenant_id}" - ))); - } + ))); + } - // Erase child shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; - Ok(AbortShardSplitStatus::Aborted) - })?; - - Ok(aborted) + Ok(AbortShardSplitStatus::Aborted) }, ) .await From c84656a53e92ca4628ffa8061e34102263576f43 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 17 May 2024 17:01:24 +0100 Subject: [PATCH 074/130] pageserver: implement auto-splitting (#7681) ## Problem Currently tenants are only split into multiple shards if a human being calls the API to do it. Issue: #7388 ## Summary of changes - Add a pageserver API for returning the top tenants by size - Add a step to the controller's background loop where if there is no reconciliation or optimization to be done, it looks for things to split. - Add a test that runs pgbench on many tenants concurrently, and checks that splitting happens as expected as tenants grow, without interrupting the client I/O. This PR is quite basic: there is a tasklist in https://github.com/neondatabase/neon/issues/7388 for further work. This PR is meant to be safe (off by default), and sufficient to enable our staging environment to run lots of sharded tenants without a human having to set them up. --- control_plane/src/local_env.rs | 4 + control_plane/src/storage_controller.rs | 4 + libs/pageserver_api/src/models.rs | 49 +++ libs/pageserver_api/src/shard.rs | 2 +- pageserver/client/src/mgmt_api.rs | 12 + pageserver/src/http/routes.rs | 98 ++++++ pageserver/src/metrics.rs | 64 ++-- pageserver/src/tenant.rs | 26 ++ .../src/tenant/remote_timeline_client.rs | 6 +- storage_controller/src/main.rs | 5 + storage_controller/src/pageserver_client.rs | 14 +- storage_controller/src/service.rs | 114 ++++++- test_runner/fixtures/pageserver/http.py | 15 + .../performance/test_sharding_autosplit.py | 280 ++++++++++++++++++ test_runner/regress/test_sharding.py | 42 +++ 15 files changed, 689 insertions(+), 46 deletions(-) create mode 100644 test_runner/performance/test_sharding_autosplit.py diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d13884198e..0edcf1be4e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -152,6 +152,9 @@ pub struct NeonStorageControllerConf { /// Heartbeat timeout before marking a node offline #[serde(with = "humantime_serde")] pub max_unavailable: Duration, + + /// Threshold for auto-splitting a tenant into shards + pub split_threshold: Option, } impl NeonStorageControllerConf { @@ -164,6 +167,7 @@ impl Default for NeonStorageControllerConf { fn default() -> Self { Self { max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + split_threshold: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f1c43f4036..96e8276f4d 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -305,6 +305,10 @@ impl StorageController { )); } + if let Some(split_threshold) = self.config.split_threshold.as_ref() { + args.push(format!("--split-threshold={split_threshold}")) + } + background_process::start_process( COMMAND, &self.env.base_data_dir, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 7cf54bf32a..d52fb5e93d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -824,6 +824,55 @@ pub struct TenantScanRemoteStorageResponse { pub shards: Vec, } +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantSorting { + ResidentSize, + MaxLogicalSize, +} + +impl Default for TenantSorting { + fn default() -> Self { + Self::ResidentSize + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TopTenantShardsRequest { + // How would you like to sort the tenants? + pub order_by: TenantSorting, + + // How many results? + pub limit: usize, + + // Omit tenants with more than this many shards (e.g. if this is the max number of shards + // that the caller would ever split to) + pub where_shards_lt: Option, + + // Omit tenants where the ordering metric is less than this (this is an optimization to + // let us quickly exclude numerous tiny shards) + pub where_gt: Option, +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +pub struct TopTenantShardItem { + pub id: TenantShardId, + + /// Total size of layers on local disk for all timelines in this tenant + pub resident_size: u64, + + /// Total size of layers in remote storage for all timelines in this tenant + pub physical_size: u64, + + /// The largest logical size of a timeline within this tenant + pub max_logical_size: u64, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TopTenantShardsResponse { + pub shards: Vec, +} + pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index ff6d3d91b6..43d9b2e48c 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -125,7 +125,7 @@ impl ShardCount { /// `v` may be zero, or the number of shards in the tenant. `v` is what /// [`Self::literal`] would return. - pub fn new(val: u8) -> Self { + pub const fn new(val: u8) -> Self { Self(val) } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 6df8b2170d..5904713da9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -486,6 +486,18 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint); + self.request(Method::POST, uri, request) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn layer_map_info( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0a98d32f02..b8d5c67ce0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,6 +1,8 @@ //! //! Management HTTP API //! +use std::cmp::Reverse; +use std::collections::BinaryHeap; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; @@ -24,7 +26,11 @@ use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; +use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; +use pageserver_api::models::TopTenantShardItem; +use pageserver_api::models::TopTenantShardsRequest; +use pageserver_api::models::TopTenantShardsResponse; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -2323,6 +2329,97 @@ async fn get_utilization( .map_err(ApiError::InternalServerError) } +/// Report on the largest tenants on this pageserver, for the storage controller to identify +/// candidates for splitting +async fn post_top_tenants( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let request: TopTenantShardsRequest = json_request(&mut r).await?; + let state = get_state(&r); + + fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 { + match order_by { + TenantSorting::ResidentSize => sizes.resident_size, + TenantSorting::MaxLogicalSize => sizes.max_logical_size, + } + } + + #[derive(Eq, PartialEq)] + struct HeapItem { + metric: u64, + sizes: TopTenantShardItem, + } + + impl PartialOrd for HeapItem { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which + /// supports popping the greatest item but not the smallest. + impl Ord for HeapItem { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + Reverse(self.metric).cmp(&Reverse(other.metric)) + } + } + + let mut top_n: BinaryHeap = BinaryHeap::with_capacity(request.limit); + + // FIXME: this is a lot of clones to take this tenant list + for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() { + if let Some(shards_lt) = request.where_shards_lt { + // Ignore tenants which already have >= this many shards + if tenant_shard_id.shard_count >= shards_lt { + continue; + } + } + + let sizes = match tenant_slot { + TenantSlot::Attached(tenant) => tenant.get_sizes(), + TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => { + continue; + } + }; + let metric = get_size_metric(&sizes, &request.order_by); + + if let Some(gt) = request.where_gt { + // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work + if metric <= gt { + continue; + } + }; + + match top_n.peek() { + None => { + // Top N list is empty: candidate becomes first member + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric && top_n.len() < request.limit => { + // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric => { + // List is at limit and lowest value is greater than our candidate, drop it. + } + Some(_) => top_n.push(HeapItem { metric, sizes }), + } + + while top_n.len() > request.limit { + top_n.pop(); + } + } + + json_response( + StatusCode::OK, + TopTenantShardsResponse { + shards: top_n.into_iter().map(|i| i.sizes).collect(), + }, + ) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2609,5 +2706,6 @@ pub fn make_router( ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) .any(handler_404)) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ffcd08b4b3..5315f0b936 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2098,7 +2098,7 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - resident_physical_size_gauge: UIntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub aux_file_size_gauge: IntGauge, @@ -2312,6 +2312,7 @@ use pin_project_lite::pin_project; use std::collections::HashMap; use std::num::NonZeroUsize; use std::pin::Pin; +use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; @@ -2321,35 +2322,35 @@ use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; /// Maintain a per timeline gauge in addition to the global gauge. -struct PerTimelineRemotePhysicalSizeGauge { - last_set: u64, +pub(crate) struct PerTimelineRemotePhysicalSizeGauge { + last_set: AtomicU64, gauge: UIntGauge, } impl PerTimelineRemotePhysicalSizeGauge { fn new(per_timeline_gauge: UIntGauge) -> Self { Self { - last_set: per_timeline_gauge.get(), + last_set: AtomicU64::new(0), gauge: per_timeline_gauge, } } - fn set(&mut self, sz: u64) { + pub(crate) fn set(&self, sz: u64) { self.gauge.set(sz); - if sz < self.last_set { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz); + let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed); + if sz < prev { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz); } else { - REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev); }; - self.last_set = sz; } - fn get(&self) -> u64 { + pub(crate) fn get(&self) -> u64 { self.gauge.get() } } impl Drop for PerTimelineRemotePhysicalSizeGauge { fn drop(&mut self) { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed)); } } @@ -2357,7 +2358,7 @@ pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, shard_id: String, timeline_id: String, - remote_physical_size_gauge: Mutex>, + pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge, calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, @@ -2365,38 +2366,27 @@ pub(crate) struct RemoteTimelineClientMetrics { impl RemoteTimelineClientMetrics { pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + let tenant_id_str = tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id_str = timeline_id.to_string(); + + let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new( + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) + .unwrap(), + ); + RemoteTimelineClientMetrics { - tenant_id: tenant_shard_id.tenant_id.to_string(), - shard_id: format!("{}", tenant_shard_id.shard_slug()), - timeline_id: timeline_id.to_string(), + tenant_id: tenant_id_str, + shard_id: shard_id_str, + timeline_id: timeline_id_str, calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), - remote_physical_size_gauge: Mutex::new(None), + remote_physical_size_gauge, } } - pub(crate) fn remote_physical_size_set(&self, sz: u64) { - let mut guard = self.remote_physical_size_gauge.lock().unwrap(); - let gauge = guard.get_or_insert_with(|| { - PerTimelineRemotePhysicalSizeGauge::new( - REMOTE_PHYSICAL_SIZE - .get_metric_with_label_values(&[ - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .unwrap(), - ) - }); - gauge.set(sz); - } - - pub(crate) fn remote_physical_size_get(&self) -> u64 { - let guard = self.remote_physical_size_gauge.lock().unwrap(); - guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0) - } - pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 026cbc107c..54b63f7042 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::TimelineState; +use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; @@ -2196,6 +2197,31 @@ impl Tenant { Ok(()) } + + pub(crate) fn get_sizes(&self) -> TopTenantShardItem { + let mut result = TopTenantShardItem { + id: self.tenant_shard_id, + resident_size: 0, + physical_size: 0, + max_logical_size: 0, + }; + + for timeline in self.timelines.lock().unwrap().values() { + result.resident_size += timeline.metrics.resident_physical_size_gauge.get(); + + result.physical_size += timeline + .remote_client + .metrics + .remote_physical_size_gauge + .get(); + result.max_logical_size = std::cmp::max( + result.max_logical_size, + timeline.metrics.current_logical_size_gauge.get(), + ); + } + + result + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 07d6af696c..3a1113cf01 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -317,7 +317,7 @@ pub struct RemoteTimelineClient { upload_queue: Mutex, - metrics: Arc, + pub(crate) metrics: Arc, storage_impl: GenericRemoteStorage, @@ -461,11 +461,11 @@ impl RemoteTimelineClient { } else { 0 }; - self.metrics.remote_physical_size_set(size); + self.metrics.remote_physical_size_gauge.set(size); } pub fn get_remote_physical_size(&self) -> u64 { - self.metrics.remote_physical_size_get() + self.metrics.remote_physical_size_gauge.get() } // diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index f1454af533..ce8f8d0cdd 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -66,6 +66,10 @@ struct Cli { #[arg(long)] max_unavailable_interval: Option, + /// Size threshold for automatically splitting shards (disabled by default) + #[arg(long)] + split_threshold: Option, + /// Maximum number of reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, @@ -255,6 +259,7 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + split_threshold: args.split_threshold, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 25b6b67e12..769aba80ca 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,7 +2,7 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -234,4 +234,16 @@ impl PageserverClient { self.inner.get_utilization().await ) } + + pub(crate) async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + measured_request!( + "top_tenants", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.top_tenant_shards(request).await + ) + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ae7e8d3d7d..f914f4e0bb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,10 +32,10 @@ use pageserver_api::{ TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, }, - models::{SecondaryProgress, TenantConfigRequest}, + models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; use reqwest::StatusCode; -use tracing::instrument; +use tracing::{instrument, Instrument}; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -222,6 +222,10 @@ pub struct Config { /// How many Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + + /// How large must a shard grow in bytes before we split it? + /// None disables auto-splitting. + pub split_threshold: Option, } impl From for ApiError { @@ -699,7 +703,7 @@ impl Service { /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible /// for those retries. #[instrument(skip_all)] - async fn background_reconcile(&self) { + async fn background_reconcile(self: &Arc) { self.startup_complete.clone().wait().await; const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); @@ -711,7 +715,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - self.optimize_all().await; + let optimizations = self.optimize_all().await; + if optimizations == 0 { + // Run new splits only when no optimizations are pending + self.autosplit_tenants().await; + } } } _ = self.cancel.cancelled() => return @@ -4766,6 +4774,104 @@ impl Service { validated_work } + /// Look for shards which are oversized and in need of splitting + async fn autosplit_tenants(self: &Arc) { + let Some(split_threshold) = self.config.split_threshold else { + // Auto-splitting is disabled + return; + }; + + let nodes = self.inner.read().unwrap().nodes.clone(); + + const SPLIT_TO_MAX: ShardCount = ShardCount::new(8); + + let mut top_n = Vec::new(); + + // Call into each node to look for big tenants + let top_n_request = TopTenantShardsRequest { + // We currently split based on logical size, for simplicity: logical size is a signal of + // the user's intent to run a large database, whereas physical/resident size can be symptoms + // of compaction issues. Eventually we should switch to using resident size to bound the + // disk space impact of one shard. + order_by: models::TenantSorting::MaxLogicalSize, + limit: 10, + where_shards_lt: Some(SPLIT_TO_MAX), + where_gt: Some(split_threshold), + }; + for node in nodes.values() { + let request_ref = &top_n_request; + match node + .with_client_retries( + |client| async move { + let request = request_ref.clone(); + client.top_tenant_shards(request.clone()).await + }, + &self.config.jwt_token, + 3, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(node_top_n)) => { + top_n.extend(node_top_n.shards.into_iter()); + } + Some(Err(mgmt_api::Error::Cancelled)) => { + continue; + } + Some(Err(e)) => { + tracing::warn!("Failed to fetch top N tenants from {node}: {e}"); + continue; + } + None => { + // Node is shutting down + continue; + } + }; + } + + // Pick the biggest tenant to split first + top_n.sort_by_key(|i| i.resident_size); + let Some(split_candidate) = top_n.into_iter().next() else { + tracing::debug!("No split-elegible shards found"); + return; + }; + + // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't + // want to block the background reconcile loop on this. + tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + + let this = self.clone(); + tokio::spawn( + async move { + match this + .tenant_shard_split( + split_candidate.id.tenant_id, + TenantShardSplitRequest { + // Always split to the max number of shards: this avoids stepping through + // intervening shard counts and encountering the overrhead of a split+cleanup + // each time as a tenant grows, and is not too expensive because our max shard + // count is relatively low anyway. + // This policy will be adjusted in future once we support higher shard count. + new_shard_count: SPLIT_TO_MAX.literal(), + new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), + }, + ) + .await + { + Ok(_) => { + tracing::info!("Successful auto-split"); + } + Err(e) => { + tracing::error!("Auto-split failed: {e}"); + } + } + } + .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), + ); + } + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should /// put the system into a quiescent state where future background reconciliations won't do anything. diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0b2963d89c..4d563a532b 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -890,3 +890,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert current_logical_size == non_incremental assert isinstance(current_logical_size, int) return current_logical_size + + def top_tenants( + self, order_by: str, limit: int, where_shards_lt: int, where_gt: int + ) -> dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/top_tenants", + json={ + "order_by": order_by, + "limit": limit, + "where_shards_lt": where_shards_lt, + "where_gt": where_gt, + }, + ) + self.verbose_error(res) + return res.json() # type: ignore diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py new file mode 100644 index 0000000000..9cd83f0959 --- /dev/null +++ b/test_runner/performance/test_sharding_autosplit.py @@ -0,0 +1,280 @@ +import concurrent.futures +import re +from pathlib import Path + +import pytest +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + tenant_get_shards, +) + + +@pytest.mark.timeout(600) +def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Check that sharding, including auto-splitting, "just works" under pgbench workloads. + + This is not a benchmark, but it lives in the same place as benchmarks in order to be run + on a dedicated node that can sustain some significant throughput. + + Other tests validate the details of shard splitting, error cases etc. This test is + the sanity check that it all really works as expected with realistic amounts of data + and under load. + + Success conditions: + - Tenants auto-split when their capacity grows + - Client workloads are not interrupted while that happens + """ + + neon_env_builder.num_pageservers = 8 + neon_env_builder.storage_controller_config = { + # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical + # sizes, physical sizes, etc). We will write this much data logically, therefore other sizes + # will reliably be greater. + "split_threshold": 1024 * 1024 * 500 + } + + tenant_conf = { + # We want layer rewrites to happen as soon as possible (this is the most stressful + # case for the system), so set PITR interval to something tiny. + "pitr_interval": "5s", + # Scaled down thresholds. We will run at ~1GB scale but would like to emulate + # the behavior of a system running at ~100GB scale. + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + } + + env = neon_env_builder.init_start() + + for ps in env.pageservers: + ps.allowed_errors.extend( + [ + # We shut down pageservers while they might have some compaction work going on + ".*Compaction failed.*shutting down.*" + ] + ) + + env.storage_controller.allowed_errors.extend( + [ + # The neon_local functionality for updating computes is flaky for unknown reasons + ".*Local notification hook failed.*", + ".*Marking shard.*for notification retry.*", + ".*Failed to notify compute.*", + ] + ) + + # Total tenants + tenant_count = 4 + + # Transaction rate: we set this rather than running at full-speed because we + # might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently. + transaction_rate = 100 + + class TenantState: + def __init__(self, timeline_id, endpoint): + self.timeline_id = timeline_id + self.endpoint = endpoint + + # Create tenants + tenants = {} + for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)): + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf) + endpoint = env.endpoints.create("main", tenant_id=tenant_id) + tenants[tenant_id] = TenantState(timeline_id, endpoint) + endpoint.start() + + def run_pgbench_init(endpoint): + pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-i", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + def check_pgbench_output(out_path: str): + """ + When we run pgbench, we want not just an absence of errors, but also continuous evidence + of I/O progressing: our shard splitting and migration should not interrrupt the benchmark. + """ + matched_lines = 0 + stderr = Path(f"{out_path}.stderr").read_text() + + low_watermark = None + + # Apply this as a threshold for what we consider an unacceptable interruption to I/O + min_tps = transaction_rate // 10 + + for line in stderr.split("\n"): + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line) + if match is None: + # Fall back to older-version pgbench output (omits failure count) + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .*", line) + if match is None: + continue + else: + (_time, tps) = match.groups() + tps = float(tps) + failed = 0 + else: + (_time, tps, failed) = match.groups() # type: ignore + tps = float(tps) + failed = int(failed) + + matched_lines += 1 + + if failed > 0: + raise RuntimeError( + f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has failed > 0" + ) + + if low_watermark is None or low_watermark > tps: + low_watermark = tps + + # Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not + # at the same time as a shard split. + # if tps < min_tps: + # raise RuntimeError( + # f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}" + # ) + + log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}") + + if matched_lines == 0: + raise RuntimeError(f"pgbench output at {out_path} contained no progress lines") + + def run_pgbench_main(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "180", + "-R", + f"{transaction_rate}", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + def run_pgbench_read(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "30", + "-R", + f"{transaction_rate}", + "-S", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench inits") + for fut in pgbench_futs: + fut.result() + + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read/write pass") + for fut in pgbench_futs: + fut.result() + + def assert_all_split(): + for tenant_id in tenants.keys(): + shards = tenant_get_shards(env, tenant_id) + assert len(shards) == 8 + + # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise + # this test is not properly doing its job of validating that splits work nicely under load. + assert_all_split() + + env.storage_controller.assert_log_contains(".*Successful auto-split.*") + + # Log timeline sizes, useful for debug, and implicitly validates that the shards + # are available in the places the controller thinks they should be. + for tenant_id, tenant_state in tenants.items(): + (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0] + timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero_id, tenant_state.timeline_id + ) + log.info(f"{shard_zero_id} timeline: {timeline_info}") + + # Run compaction for all tenants, restart endpoint so that on subsequent reads we will + # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # layers but not do any rewrites (we're still in the same generation) + for tenant_id, tenant_state in tenants.items(): + tenant_state.endpoint.stop() + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id) + tenant_state.endpoint.start() + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + env.storage_controller.consistency_check() + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.consistency_check() + + # Restart all pageservers + for ps in env.pageservers: + ps.stop() + ps.start() + + # Freshen gc_info in Timeline, so that when compaction runs in the background in the + # subsequent pgbench period, the last_gc_cutoff is updated and enables the conditions for a rewrite to pass. + for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + + # One last check data remains readable after everything has restarted + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + # Assert that some rewrites happened + # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged + # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 87544af598..1bfeec6f4b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1326,3 +1326,45 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): # Ensure that post-endpoint-restart modifications are ingested happily by pageserver wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + +def test_top_tenants(neon_env_builder: NeonEnvBuilder): + """ + The top_tenants API is used in shard auto-splitting to find candidates. + """ + + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenants = [] + n_tenants = 8 + for i in range(0, n_tenants): + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id) + + # Write a different amount of data to each tenant + w = Workload(env, tenant_id, timeline_id) + w.init() + w.write_rows(i * 1000) + w.stop() + + logical_size = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "current_logical_size" + ] + tenants.append((tenant_id, timeline_id, logical_size)) + + log.info(f"Created {tenant_id}/{timeline_id} with size {logical_size}") + + # Ask for 1 largest tenant + top_1 = env.pageserver.http_client().top_tenants("max_logical_size", 1, 8, 0) + assert len(top_1["shards"]) == 1 + assert top_1["shards"][0]["id"] == str(tenants[-1][0]) + assert top_1["shards"][0]["max_logical_size"] == tenants[-1][2] + + # Apply a lower bound limit + top = env.pageserver.http_client().top_tenants( + "max_logical_size", 100, 8, where_gt=tenants[3][2] + ) + assert len(top["shards"]) == n_tenants - 4 + assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) From aaf60819fa479e37a4b477b20e1fbcee2d5a046f Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 17 May 2024 15:22:49 -0400 Subject: [PATCH 075/130] feat(pageserver): persist aux file policy in index part (#7668) Part of https://github.com/neondatabase/neon/issues/7462 ## Summary of changes Tenant config is not persisted unless it's attached on the storage controller. In this pull request, we persist the aux file policy flag in the `index_part.json`. Admins can set `switch_aux_file_policy` in the storage controller or using the page server API. Upon the first aux file gets written, the write path will compare the aux file policy target with the current policy. If it is switch-able, we will do the switch. Otherwise, the original policy will be used. The test cases show what the admins can do / cannot do. The `last_aux_file_policy` is stored in `IndexPart`. Updates to the persisted policy are done via `schedule_index_upload_for_aux_file_policy_update`. On the write path, the writer will update the field. --------- Signed-off-by: Alex Chi Z Co-authored-by: Joonas Koivunen --- libs/pageserver_api/src/models.rs | 134 ++++++++ pageserver/ctl/src/main.rs | 1 + pageserver/src/http/routes.rs | 2 + pageserver/src/pgdatadir_mapping.rs | 33 +- pageserver/src/tenant.rs | 296 ++++++++++++++++++ pageserver/src/tenant/config.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 13 + .../tenant/remote_timeline_client/index.rs | 87 ++++- pageserver/src/tenant/timeline.rs | 9 +- pageserver/src/tenant/timeline/delete.rs | 2 + pageserver/src/tenant/upload_queue.rs | 6 + test_runner/regress/test_aux_files.py | 72 +++++ 12 files changed, 648 insertions(+), 11 deletions(-) create mode 100644 test_runner/regress/test_aux_files.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d52fb5e93d..80ca696313 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -10,6 +10,7 @@ use std::{ io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, str::FromStr, + sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -308,13 +309,88 @@ pub struct TenantConfig { pub switch_aux_file_policy: Option, } +/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` +/// tenant config. When the first aux file written, the policy will be persisted in the +/// `index_part.json` file and has a limited migration path. +/// +/// Currently, we only allow the following migration path: +/// +/// Unset -> V1 +/// -> V2 +/// -> CrossValidation -> V2 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum AuxFilePolicy { + /// V1 aux file policy: store everything in AUX_FILE_KEY V1, + /// V2 aux file policy: store in the AUX_FILE keyspace V2, + /// Cross validation runs both formats on the write path and does validation + /// on the read path. CrossValidation, } +impl AuxFilePolicy { + pub fn is_valid_migration_path(from: Option, to: Self) -> bool { + matches!( + (from, to), + (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) + ) + } + + /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used. + pub fn default_tenant_config() -> Self { + Self::V1 + } +} + +/// The aux file policy memory flag. Users can store `Option` into this atomic flag. 0 == unspecified. +pub struct AtomicAuxFilePolicy(AtomicUsize); + +impl AtomicAuxFilePolicy { + pub fn new(policy: Option) -> Self { + Self(AtomicUsize::new( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + )) + } + + pub fn load(&self) -> Option { + match self.0.load(std::sync::atomic::Ordering::Acquire) { + 0 => None, + other => Some(AuxFilePolicy::from_usize(other)), + } + } + + pub fn store(&self, policy: Option) { + self.0.store( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + std::sync::atomic::Ordering::Release, + ); + } +} + +impl AuxFilePolicy { + pub fn to_usize(self) -> usize { + match self { + Self::V1 => 1, + Self::CrossValidation => 2, + Self::V2 => 3, + } + } + + pub fn try_from_usize(this: usize) -> Option { + match this { + 1 => Some(Self::V1), + 2 => Some(Self::CrossValidation), + 3 => Some(Self::V2), + _ => None, + } + } + + pub fn from_usize(this: usize) -> Self { + Self::try_from_usize(this).unwrap() + } +} + impl FromStr for AuxFilePolicy { type Err = anyhow::Error; @@ -604,6 +680,9 @@ pub struct TimelineInfo { pub state: TimelineState, pub walreceiver_status: String, + + /// The last aux file policy being used on this timeline + pub last_aux_file_policy: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -1505,4 +1584,59 @@ mod tests { assert_eq!(actual, expected, "example on {line}"); } } + + #[test] + fn test_aux_file_migration_path() { + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V1 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V2 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::CrossValidation + )); + // Self-migration is not a valid migration path, and the caller should handle it by itself. + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::CrossValidation + )); + // Migrations not allowed + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::CrossValidation + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::CrossValidation + )); + // Migrations allowed + assert!(AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V2 + )); + } } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 1fb75584fc..e92c352dab 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -219,6 +219,7 @@ fn handle_metadata( let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; + // TODO: simplify this part if let Some(disk_consistent_lsn) = disk_consistent_lsn { meta = TimelineMetadata::new( *disk_consistent_lsn, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b8d5c67ce0..7efd48afc7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -439,6 +439,8 @@ async fn build_timeline_info_common( state, walreceiver_status, + + last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 1092d64d33..402f075365 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -35,7 +35,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -718,10 +718,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { - match self.get_switch_aux_file_policy() { - AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await, - AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await, - AuxFilePolicy::CrossValidation => { + let current_policy = self.last_aux_file_policy.load(); + match current_policy { + Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await, + Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, + Some(AuxFilePolicy::CrossValidation) => { let v1_result = self.list_aux_files_v1(lsn, ctx).await; let v2_result = self.list_aux_files_v2(lsn, ctx).await; match (v1_result, v2_result) { @@ -1469,7 +1470,27 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let policy = self.tline.get_switch_aux_file_policy(); + let switch_policy = self.tline.get_switch_aux_file_policy(); + + let policy = { + let current_policy = self.tline.last_aux_file_policy.load(); + // Allowed switch path: + // * no aux files -> v1/v2/cross-validation + // * cross-validation->v2 + if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { + self.tline.last_aux_file_policy.store(Some(switch_policy)); + self.tline + .remote_client + .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?; + info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); + switch_policy + } else { + // This branch handles non-valid migration path, and the case that switch_policy == current_policy. + // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. + current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) + } + }; + if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 54b63f7042..d42b9082b7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,6 +20,7 @@ use futures::stream::FuturesUnordered; use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::WalRedoManagerStatus; @@ -529,6 +530,7 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, + last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -539,6 +541,10 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, + // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, + // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. + // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. + last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -553,6 +559,10 @@ impl Tenant { if let Some(index_part) = index_part.as_ref() { timeline.remote_client.init_upload_queue(index_part)?; + + timeline + .last_aux_file_policy + .store(index_part.last_aux_file_policy()); } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. @@ -1173,12 +1183,15 @@ impl Tenant { None }; + let last_aux_file_policy = index_part.last_aux_file_policy(); + self.timeline_init_and_sync( timeline_id, resources, Some(index_part), remote_metadata, ancestor, + last_aux_file_policy, ctx, ) .await @@ -1358,6 +1371,7 @@ impl Tenant { create_guard, initdb_lsn, None, + None, ) .await } @@ -2441,6 +2455,7 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, + last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -2469,6 +2484,7 @@ impl Tenant { resources, pg_version, state, + last_aux_file_policy, self.cancel.child_token(), ); @@ -3119,6 +3135,7 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3312,6 +3329,7 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, + None, ) .await?; @@ -3383,6 +3401,7 @@ impl Tenant { create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, + last_aux_file_policy: Option, ) -> anyhow::Result { let tenant_shard_id = self.tenant_shard_id; @@ -3398,6 +3417,7 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, + last_aux_file_policy, ) .context("Failed to create timeline data structure")?; @@ -5621,4 +5641,280 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_branch_copies_dirty_aux_file_flag() { + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + + // the default aux file policy to switch is v1 if not set by the admins + assert_eq!( + harness.tenant_conf.switch_aux_file_policy, + AuxFilePolicy::V1 + ); + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // no aux file is written at this point, so the persistent flag should be unset + assert_eq!(tline.last_aux_file_policy.load(), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1" + ); + + // we can read everything from the storage + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep v1 storage format when new files are written" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + // child copies the last flag even if that is not on remote storage yet + assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); + assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1)); + + let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + assert_eq!(files.get("pg_logical/mappings/test2"), None); + + // even if we crash here without flushing parent timeline with it's new + // last_aux_file_policy we are safe, because child was never meant to access ancestor's + // files. the ancestor can even switch back to V1 because of a migration safely. + } + + #[tokio::test] + async fn aux_file_policy_switch() { + let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::CrossValidation), + "dirty index_part.json reflected state is yet to be updated" + ); + + // we can still read the auxfile v1 before we ingest anything new + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")), + "cross validation writes to both v1 and v2 so this should be available in v2" + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V1), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"third", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V1, + "wanted state has been updated again, even if invalid request" + ); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + + // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test3", b"last", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); + + assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + assert_eq!( + files.get("pg_logical/mappings/test3"), + Some(&bytes::Bytes::from_static(b"last")) + ); + } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a743ce3c16..a695363cdc 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -373,6 +373,8 @@ pub struct TenantConf { /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. pub switch_aux_file_policy: AuxFilePolicy, } @@ -574,7 +576,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: AuxFilePolicy::V1, + switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), } } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3a1113cf01..d3adae6841 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -189,6 +189,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -611,6 +612,17 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. + pub(crate) fn schedule_index_upload_for_aux_file_policy_update( + self: &Arc, + last_aux_file_policy: Option, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.last_aux_file_policy = last_aux_file_policy; + self.schedule_index_upload(upload_queue); + Ok(()) + } /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1851,6 +1863,7 @@ impl RemoteTimelineClient { dangling_files: HashMap::default(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: initialized.last_aux_file_policy, }; let upload_queue = std::mem::replace( diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b114d6aa10..032dda7ff3 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; +use pageserver_api::models::AuxFilePolicy; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; @@ -88,6 +89,16 @@ pub struct IndexPart { #[serde(default)] pub(crate) lineage: Lineage, + + /// Describes the kind of aux files stored in the timeline. + /// + /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. + /// A V1 setting after V2 files have been committed is not accepted. + /// + /// None means no aux files have been written to the storage before the point + /// when this flag is introduced. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) last_aux_file_policy: Option, } impl IndexPart { @@ -101,10 +112,11 @@ impl IndexPart { /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. /// - 5: lineage was added - const LATEST_VERSION: usize = 5; + /// - 6: last_aux_file_policy is added. + const LATEST_VERSION: usize = 6; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -113,6 +125,7 @@ impl IndexPart { disk_consistent_lsn: Lsn, metadata: TimelineMetadata, lineage: Lineage, + last_aux_file_policy: Option, ) -> Self { let layer_metadata = layers_and_metadata .iter() @@ -126,6 +139,7 @@ impl IndexPart { metadata, deleted_at: None, lineage, + last_aux_file_policy, } } @@ -155,8 +169,13 @@ impl IndexPart { example_metadata.disk_consistent_lsn(), example_metadata, Default::default(), + Some(AuxFilePolicy::V1), ) } + + pub(crate) fn last_aux_file_policy(&self) -> Option { + self.last_aux_file_policy + } } impl From<&UploadQueueInitialized> for IndexPart { @@ -165,7 +184,13 @@ impl From<&UploadQueueInitialized> for IndexPart { let metadata = uq.latest_metadata.clone(); let lineage = uq.latest_lineage.clone(); - Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage) + Self::new( + &uq.latest_files, + disk_consistent_lsn, + metadata, + lineage, + uq.last_aux_file_policy, + ) } } @@ -299,6 +324,7 @@ mod tests { metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -340,6 +366,7 @@ mod tests { metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -383,6 +410,7 @@ mod tests { deleted_at: Some(chrono::NaiveDateTime::parse_from_str( "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -428,6 +456,7 @@ mod tests { .unwrap(), deleted_at: None, lineage: Lineage::default(), + last_aux_file_policy: None, }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -468,6 +497,7 @@ mod tests { metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -511,6 +541,57 @@ mod tests { reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), }, + last_aux_file_policy: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v6_indexpart_is_parsed() { + let example = r#"{ + "version":6, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "deleted_at": "2023-07-31T09:00:00.123", + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + }, + "last_aux_file_policy": "V2" + }"#; + + let expected = IndexPart { + version: 6, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: Some(chrono::NaiveDateTime::parse_from_str( + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + last_aux_file_policy: Some(AuxFilePolicy::V2), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index df9bc9b35b..1fb1928079 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,7 +23,7 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ - AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, + AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, }, @@ -413,7 +413,11 @@ pub struct Timeline { /// Keep aux directory cache to avoid it's reconstruction on each update pub(crate) aux_files: tokio::sync::Mutex, + /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, + + /// Indicate whether aux file v2 storage is enabled. + pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, } pub struct WalReceiverInfo { @@ -2133,6 +2137,7 @@ impl Timeline { resources: TimelineResources, pg_version: u32, state: TimelineState, + aux_file_policy: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2257,6 +2262,8 @@ impl Timeline { }), aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), + + last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 901f5149b3..b5dfc86e77 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -280,6 +280,8 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, + // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace + None, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index a2f761fa94..c0cc8f3124 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -8,6 +8,7 @@ use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; use chrono::NaiveDateTime; +use pageserver_api::models::AuxFilePolicy; use std::sync::Arc; use tracing::info; use utils::lsn::AtomicLsn; @@ -60,6 +61,9 @@ pub(crate) struct UploadQueueInitialized { /// Part of the flattened "next" `index_part.json`. pub(crate) latest_lineage: Lineage, + /// The last aux file policy used on this timeline. + pub(crate) last_aux_file_policy: Option, + /// `disk_consistent_lsn` from the last metadata file that was successfully /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. @@ -189,6 +193,7 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: Default::default(), }; *self = UploadQueue::Initialized(state); @@ -239,6 +244,7 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + last_aux_file_policy: index_part.last_aux_file_policy(), }; *self = UploadQueue::Initialized(state); diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py new file mode 100644 index 0000000000..be9c41a867 --- /dev/null +++ b/test_runner/regress/test_aux_files.py @@ -0,0 +1,72 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + logical_replication_sync, +) + + +def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + client = env.pageserver.http_client() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = "V2" + client.set_tenant_config(tenant_id, tenant_config) + # aux file v2 is enabled on the write path, so for now, it should be unset (or null) + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] + is None + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("create table t(pk integer primary key, payload integer)") + cur.execute( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" + ) + cur.execute("create publication pub1 for table t, replication_example") + + # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) + # instead of going through the full logical replication process. + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + vanilla_pg.safe_psql( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" + ) + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + # Wait logical replication channel to be established + logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() + endpoint.stop() + + with env.pageserver.http_client() as client: + # aux file v2 flag should be enabled at this point + assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2" + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = "V1" + client.set_tenant_config(tenant_id, tenant_config) + # the flag should still be enabled + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == "V2" + ) + env.pageserver.restart() + with env.pageserver.http_client() as client: + # aux file v2 flag should be persisted + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == "V2" + ) From e1a9669d05374ea27685a1cf527676fe01df7722 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 17 May 2024 16:04:02 -0400 Subject: [PATCH 076/130] feat(pagebench): add aux file bench (#7746) part of https://github.com/neondatabase/neon/issues/7462 ## Summary of changes This pull request adds two APIs to the pageserver management API: list_aux_files and ingest_aux_files. The aux file pagebench is intended to be used on an empty timeline because the data do not go through the safekeeper. LSNs are advanced by 8 for each ingestion, to avoid invariant checks inside the pageserver. For now, I only care about space amplification / read amplification, so the bench is designed in a very simple way: ingest 10000 files, and I will manually dump the layer map to analyze. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 10 +++ pageserver/client/src/mgmt_api.rs | 57 +++++++++++++ pageserver/pagebench/src/cmd/aux_files.rs | 98 +++++++++++++++++++++++ pageserver/pagebench/src/main.rs | 3 + pageserver/src/http/routes.rs | 75 +++++++++++++++++ 5 files changed, 243 insertions(+) create mode 100644 pageserver/pagebench/src/cmd/aux_files.rs diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 80ca696313..451ee1a13c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -841,6 +841,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, } +#[derive(Debug, Serialize, Deserialize)] +pub struct IngestAuxFilesRequest { + pub aux_files: HashMap, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListAuxFilesRequest { + pub lsn: Lsn, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DownloadRemoteLayersTaskInfo { pub task_id: String, diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 5904713da9..69b86d9c46 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,8 +1,12 @@ +use std::collections::HashMap; + +use bytes::Bytes; use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ http::error::HttpErrorBody, id::{TenantId, TimelineId}, + lsn::Lsn, }; pub mod util; @@ -561,4 +565,57 @@ impl Client { }), } } + + pub async fn ingest_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + aux_files: HashMap, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/ingest_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files }) + .await?; + match resp.status() { + StatusCode::OK => Ok(true), + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn list_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/list_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn }) + .await?; + match resp.status() { + StatusCode::OK => { + let resp: HashMap = resp.json().await.map_err(|e| { + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")) + })?; + Ok(resp) + } + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } } diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs new file mode 100644 index 0000000000..eb5b242a5f --- /dev/null +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -0,0 +1,98 @@ +use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::shard::TenantShardId; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use std::collections::HashMap; +use std::sync::Arc; + +/// Ingest aux files into the pageserver. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: None, + targets: { + if let Some(targets) = &args.targets { + if targets.len() != 1 { + anyhow::bail!("must specify exactly one target"); + } + Some(targets.clone()) + } else { + None + } + }, + }, + ) + .await?; + + let timeline = timelines[0]; + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + let timeline_id = timeline.timeline_id; + + println!("operating on timeline {}", timeline); + + mgmt_api_client + .tenant_config(&TenantConfigRequest { + tenant_id: timeline.tenant_id, + config: TenantConfig { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + }) + .await?; + + for batch in 0..100 { + let items = (0..100) + .map(|id| { + ( + format!("pg_logical/mappings/{:03}.{:03}", batch, id), + format!("{:08}", id), + ) + }) + .collect::>(); + let file_cnt = items.len(); + mgmt_api_client + .ingest_aux_files(tenant_shard_id, timeline_id, items) + .await?; + println!("ingested {file_cnt} files"); + } + + let files = mgmt_api_client + .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + + println!("{} files found", files.len()); + + anyhow::Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 743102d853..5527557450 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -14,6 +14,7 @@ mod util { /// The pagebench CLI sub-commands, dispatched in [`main`] below. mod cmd { + pub(super) mod aux_files; pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; pub(super) mod ondemand_download_churn; @@ -27,6 +28,7 @@ enum Args { GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), OndemandDownloadChurn(cmd::ondemand_download_churn::Args), + AuxFiles(cmd::aux_files::Args), } fn main() { @@ -46,6 +48,7 @@ fn main() { cmd::trigger_initial_size_calculation::main(args) } Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), + Args::AuxFiles(args) => cmd::aux_files::main(args), } .unwrap() } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7efd48afc7..0eab6510ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -16,6 +16,8 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::IngestAuxFilesRequest; +use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; @@ -2331,6 +2333,71 @@ async fn get_utilization( .map_err(ApiError::InternalServerError) } +async fn list_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: ListAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let process = || async move { + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let files = timeline.list_aux_files(body.lsn, &ctx).await?; + Ok::<_, anyhow::Error>(files) + }; + + match process().await { + Ok(st) => json_response(StatusCode::OK, st), + Err(err) => json_response( + StatusCode::INTERNAL_SERVER_ERROR, + ApiError::InternalServerError(err).to_string(), + ), + } +} + +async fn ingest_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: IngestAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let process = || async move { + let mut modification = timeline.begin_modification(Lsn( + timeline.get_last_record_lsn().0 + 8 + ) /* advance LSN by 8 */); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + for (fname, content) in body.aux_files { + modification + .put_file(&fname, content.as_bytes(), &ctx) + .await?; + } + modification.commit(&ctx).await?; + Ok::<_, anyhow::Error>(()) + }; + + match process().await { + Ok(st) => json_response(StatusCode::OK, st), + Err(err) => Err(ApiError::InternalServerError(err)), + } +} + /// Report on the largest tenants on this pageserver, for the storage controller to identify /// candidates for splitting async fn post_top_tenants( @@ -2708,6 +2775,14 @@ pub fn make_router( ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", + |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files", + |r| testing_api_handler("list_aux_files", r, list_aux_files), + ) .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) .any(handler_404)) } From 5caee4ca54ea16905ccc3e7f60b3221f33a74b91 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 19 May 2024 20:49:49 +0300 Subject: [PATCH 077/130] Fix calculation in test The comment says that this checks if there's enough space on the page for logical message *and* an XLOG_SWITCH. So the sizes of the logical message and the XLOG_SWITCH record should be added together, not subtracted. I saw a panic in the test that led me to investigate and notice this (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7803/9142396223/index.html): RuntimeError: Run ['/tmp/neon/bin/wal_craft', 'in-existing', 'last_wal_record_xlog_switch_ends_on_page_boundary', "host=localhost port=16165 user=cloud_admin dbname=postgres options='-cstatement_timeout=120s '"] failed: stdout: stderr: thread 'main' panicked at libs/postgres_ffi/wal_craft/src/lib.rs:370:27: attempt to subtract with overflow stack backtrace: 0: rust_begin_unwind at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/std/src/panicking.rs:645:5 1: core::panicking::panic_fmt at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:72:14 2: core::panicking::panic at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:145:5 3: ::craft:: at libs/postgres_ffi/wal_craft/src/lib.rs:370:27 4: wal_craft::main::{closure#0} at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:21:17 5: wal_craft::main at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:66:47 6: core::result::Result<(), anyhow::Error> as core::ops::function::FnOnce<()>>::call_once at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/ops/function.rs:250:5 note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. --- libs/postgres_ffi/wal_craft/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 262068cbda..b6769629a8 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { // Is there enough space on the page for another logical message and an // XLOG_SWITCH? If not, start over. let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; - if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 { continue; } From a5ecca976ec3abf97c8c3db4f78c230b4553b509 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Sun, 19 May 2024 20:45:53 +0100 Subject: [PATCH 078/130] proxy: bump parquet (#7782) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary of changes Updates the parquet lib. one change left that we need is in an open PR against upstream, hopefully we can remove the git dependency by 52.0.0 https://github.com/apache/arrow-rs/pull/5773 I'm not sure why the parquet files got a little bit bigger. I tested them and they still open fine. 🤷 side effect of the update, chrono updated and added yet another deprecation warning (hence why the safekeepers change) --- Cargo.lock | 29 ++++++++++---- Cargo.toml | 8 ++-- proxy/src/context/parquet.rs | 73 +++++++++++++++++++----------------- safekeeper/src/broker.rs | 2 +- workspace_hack/Cargo.toml | 4 +- 5 files changed, 67 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e1edd53fea..e6060c82f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1072,9 +1072,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1082,7 +1082,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.0", + "windows-targets 0.52.4", ] [[package]] @@ -1109,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", - "half", + "half 1.8.2", ] [[package]] @@ -2278,6 +2278,17 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + [[package]] name = "hash32" version = "0.3.1" @@ -3902,12 +3913,13 @@ dependencies = [ [[package]] name = "parquet" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "ahash", "bytes", "chrono", + "half 2.4.1", "hashbrown 0.14.5", "num", "num-bigint", @@ -3916,12 +3928,13 @@ dependencies = [ "thrift", "twox-hash", "zstd", + "zstd-sys", ] [[package]] name = "parquet_derive" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "parquet", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index b59a5dcd6d..2a7dea447e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -122,8 +122,8 @@ opentelemetry = "0.20.0" opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "49.0.0" +parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } +parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.14" @@ -244,8 +244,8 @@ tonic-build = "0.9" tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } # bug fixes for UUID -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } -parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } +parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } ################# Binary contents sections diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 8104fe6087..392821c430 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -307,7 +307,7 @@ where } async fn upload_parquet( - w: SerializedFileWriter>, + mut w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, ) -> anyhow::Result> { @@ -319,11 +319,15 @@ async fn upload_parquet( // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 - let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish()) + let (mut buffer, metadata) = + tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> { + let metadata = w.finish()?; + let buffer = std::mem::take(w.inner_mut().get_mut()); + Ok((buffer, metadata)) + }) .await .unwrap()?; - let mut buffer = writer.into_inner(); let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; @@ -474,10 +478,11 @@ mod tests { RequestData { session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), - timestamp: chrono::NaiveDateTime::from_timestamp_millis( + timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) - .unwrap(), + .unwrap() + .naive_utc(), application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), @@ -560,15 +565,15 @@ mod tests { assert_eq!( file_stats, [ - (1315008, 3, 6000), - (1315001, 3, 6000), - (1315061, 3, 6000), - (1315018, 3, 6000), - (1315148, 3, 6000), - (1314990, 3, 6000), - (1314782, 3, 6000), - (1315018, 3, 6000), - (438575, 1, 2000) + (1315314, 3, 6000), + (1315307, 3, 6000), + (1315367, 3, 6000), + (1315324, 3, 6000), + (1315454, 3, 6000), + (1315296, 3, 6000), + (1315088, 3, 6000), + (1315324, 3, 6000), + (438713, 1, 2000) ] ); @@ -598,11 +603,11 @@ mod tests { assert_eq!( file_stats, [ - (1221738, 5, 10000), - (1227888, 5, 10000), - (1229682, 5, 10000), - (1229044, 5, 10000), - (1220322, 5, 10000) + (1222212, 5, 10000), + (1228362, 5, 10000), + (1230156, 5, 10000), + (1229518, 5, 10000), + (1220796, 5, 10000) ] ); @@ -634,11 +639,11 @@ mod tests { assert_eq!( file_stats, [ - (1207385, 5, 10000), - (1207116, 5, 10000), - (1207409, 5, 10000), - (1207397, 5, 10000), - (1207652, 5, 10000) + (1207859, 5, 10000), + (1207590, 5, 10000), + (1207883, 5, 10000), + (1207871, 5, 10000), + (1208126, 5, 10000) ] ); @@ -663,15 +668,15 @@ mod tests { assert_eq!( file_stats, [ - (1315008, 3, 6000), - (1315001, 3, 6000), - (1315061, 3, 6000), - (1315018, 3, 6000), - (1315148, 3, 6000), - (1314990, 3, 6000), - (1314782, 3, 6000), - (1315018, 3, 6000), - (438575, 1, 2000) + (1315314, 3, 6000), + (1315307, 3, 6000), + (1315367, 3, 6000), + (1315324, 3, 6000), + (1315454, 3, 6000), + (1315296, 3, 6000), + (1315088, 3, 6000), + (1315324, 3, 6000), + (438713, 1, 2000) ] ); @@ -708,7 +713,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] + [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 98f58d3e49..ea16ce450f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -319,7 +319,7 @@ async fn task_stats(stats: Arc) { let now = BrokerStats::now_millis(); if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { - let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); info!("no broker updates for some time, last update: {:?}", ts); } } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b605757f64..7582562450 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -51,7 +51,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -102,7 +102,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } From 291fcb9e4f7086237c409580b1fd3accf46d5ba3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 20 May 2024 09:25:25 +0100 Subject: [PATCH 079/130] pageserver: use the heatmap upload interval to set the secondary download interval (#7793) ## Problem The heatmap upload period is configurable, but secondary mode downloads were using a fixed download period. Closes: #6200 ## Summary of changes - Use the upload period in the heatmap to adjust the download period. In practice, this will reduce the frequency of downloads from its current 60 second period to what heatmaps use, which is 5-10m depending on environment. This is an improvement rather than being optimal: we could be smarter about periods, and schedule downloads to occur around the time we expect the next upload, rather than just using the same period, but that's something we can address in future if it comes up. --- pageserver/src/tenant/secondary/downloader.rs | 84 ++++++++++--------- .../regress/test_pageserver_secondary.py | 18 ++-- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8f27220771..609e1431cf 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -62,14 +62,10 @@ use super::{ CommandRequest, DownloadCommand, }; -/// For each tenant, how long must have passed since the last download_tenant call before -/// calling it again. This is approximately the time by which local data is allowed -/// to fall behind remote data. -/// -/// TODO: this should just be a default, and the actual period should be controlled -/// via the heatmap itself -/// `` -const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); +/// For each tenant, default period for how long must have passed since the last download_tenant call before +/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first +/// download, if the uploader populated it. +const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); /// Range of concurrency we may use when downloading layers within a timeline. This is independent /// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in @@ -152,14 +148,22 @@ pub(super) struct SecondaryDetailTimeline { pub(super) evicted_at: HashMap, } +// Aspects of a heatmap that we remember after downloading it +#[derive(Clone, Debug)] +struct DownloadSummary { + etag: Etag, + #[allow(unused)] + mtime: SystemTime, + upload_period: Duration, +} + /// This state is written by the secondary downloader, it is opaque /// to TenantManager #[derive(Debug)] pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, - last_download: Option, - last_etag: Option, + last_download: Option, next_download: Option, pub(super) timelines: HashMap, } @@ -189,7 +193,6 @@ impl SecondaryDetail { Self { config, last_download: None, - last_etag: None, next_download: None, timelines: HashMap::new(), } @@ -243,9 +246,8 @@ impl SecondaryDetail { struct PendingDownload { secondary_state: Arc, - last_download: Option, + last_download: Option, target_time: Option, - period: Option, } impl scheduler::PendingJob for PendingDownload { @@ -295,10 +297,17 @@ impl JobGenerator SchedulingResult { @@ -331,11 +340,11 @@ impl JobGenerator next_download { @@ -343,7 +352,6 @@ impl JobGenerator TenantDownloader<'a> { let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); // We will use the etag from last successful download to make the download conditional on changes - let last_etag = self + let last_download = self .secondary_state .detail .lock() .unwrap() - .last_etag + .last_download .clone(); // Download the tenant's heatmap @@ -539,7 +540,7 @@ impl<'a> TenantDownloader<'a> { etag: heatmap_etag, bytes: heatmap_bytes, } = match tokio::select!( - bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?}, + bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) ) { HeatMapDownload::Unmodified => { @@ -599,7 +600,14 @@ impl<'a> TenantDownloader<'a> { // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. - self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag); + self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { + etag: heatmap_etag, + mtime: heatmap_mtime, + upload_period: heatmap + .upload_period_ms + .map(|ms| Duration::from_millis(ms as u64)) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL), + }); Ok(()) } diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index fdc09a063d..127340a1e7 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -575,7 +575,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): tenant_timelines = {} # This mirrors a constant in `downloader.rs` - freshen_interval_secs = 60 + default_download_period_secs = 60 + + # The upload period, which will also be the download once the secondary has seen its first heatmap + upload_period_secs = 20 for _i in range(0, tenant_count): tenant_id = TenantId.generate() @@ -587,7 +590,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): placement_policy='{"Attached":1}', # Run with a low heatmap period so that we can avoid having to do synthetic API calls # to trigger the upload promptly. - conf={"heatmap_period": "1s"}, + conf={"heatmap_period": f"{upload_period_secs}s"}, ) env.neon_cli.create_timeline("main2", tenant_id, timeline_b) @@ -597,7 +600,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # Wait long enough that the background downloads should happen; we expect all the inital layers # of all the initial timelines to show up on the secondary location of each tenant. - time.sleep(freshen_interval_secs * 1.5) + time.sleep(default_download_period_secs * 1.5) for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -613,8 +616,8 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # Delete the second timeline: this should be reflected later on the secondary env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) - # Wait long enough for the secondary locations to see the deletion - time.sleep(freshen_interval_secs * 1.5) + # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor + time.sleep(upload_period_secs * 2.5) for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -626,6 +629,9 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): assert ps_secondary.list_layers(tenant_id, timelines[0]) # This one was deleted + log.info( + f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}" + ) assert not ps_secondary.list_layers(tenant_id, timelines[1]) t_end = time.time() @@ -640,7 +646,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) - expect_download_rate = 1.0 / freshen_interval_secs + expect_download_rate = 1.0 / upload_period_secs log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") assert download_rate < expect_download_rate * 2 From a7b84cca5aef3dacc90895f39f0e0ad2b6c950cc Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 20 May 2024 12:07:25 +0200 Subject: [PATCH 080/130] Upgrade of pgvector to 0.7.0 (#7726) Upgrade pgvector to 0.7.0. This PR is based on Heikki's PR #6753 and just uses pgvector 0.7.0 instead of 0.6.0 I have now done all planned manual tests. The pull request is ready to be reviewed and merged and can be deployed in production together / after swap enablement. See (https://github.com/neondatabase/autoscaling/issues/800) Fixes https://github.com/neondatabase/neon/issues/6516 Fixes https://github.com/neondatabase/neon/issues/7780 ## Documentation input for usage recommendations ### maintenance_work_mem In Neon `maintenance_work_mem` is very small by default (depends on configured RAM for your compute but can be as low as 64 MB). To optimize pgvector index build time you may have to bump it up according to your working set size (size of tuples for vector index creation). You can do so in the current session using `SET maintenance_work_mem='10 GB';` The target value you choose should fit into the memory of your compute size and not exceed 50-60% of available RAM. The value above has been successfully used on a 7CU endpoint. ### max_parallel_maintenance_workers max_parallel_maintenance_workers is also small by default (2). For efficient parallel pgvector index creation you have to bump it up with `SET max_parallel_maintenance_workers = 7` to make use of all the CPUs available, assuming you have configured your endpoint to use 7CU. ## ID input for changelog pgvector extension in Neon has been upgraded from version 0.5.1 to version 0.7.0. Please see https://github.com/pgvector/pgvector/ for documentation of new capabilities in pgvector version 0.7.0 If you have existing databases with pgvector 0.5.1 already installed there is a slight difference in behavior in the following corner cases even if you don't run `ALTER EXTENSION UPDATE`: ### L2 distance from NULL::vector For the following script, comparing the NULL::vector to non-null vectors the resulting output changes: ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING hnsw (val vector_l2_ops); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` and now the output is ``` val --------- [1,1,1] [1,2,4] [1,2,3] [0,0,0] (4 rows) ``` For the following script ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING ivfflat (val vector_l2_ops) WITH (lists = 1); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` the output now is ``` val --------- [0,0,0] [1,2,3] [1,1,1] [1,2,4] (4 rows) ``` ### changed error messages If you provide invalid literals for datatype vector you may get improved/changed error messages, for example: ```sql neondb=> SELECT '[4e38,1]'::vector; ERROR: "4e38" is out of range for type vector LINE 1: SELECT '[4e38,1]'::vector; ^ ``` --------- Co-authored-by: Heikki Linnakangas --- .dockerignore | 1 + Dockerfile.compute-node | 7 +++- patches/pgvector.patch | 78 ++++++++++++++++++++++++++++++++++++++ pgxn/neon/pagestore_smgr.c | 19 +++++++++- 4 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index f7a6232ba1..1258532db8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,6 +17,7 @@ !libs/ !neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ !s3_scrubber/ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index bd4534ce1d..5bf3246f34 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ +COPY patches/pgvector.patch /pgvector.patch + +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ + echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..84ac6644c5 --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,78 @@ +From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 +From: Heikki Linnakangas +Date: Fri, 2 Feb 2024 22:26:45 +0200 +Subject: [PATCH 1/1] Make v0.6.0 work with Neon + +Now that the WAL-logging happens as a separate step at the end of the +build, we need a few neon-specific hints to make it work. +--- + src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index 680789b..ec54dea 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ + if (RelationNeedsWAL(index)) ++ { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + FreeBuildState(buildstate); + } + +-- +2.39.2 + diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index e3b841f526..249ad313b0 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -2822,10 +2823,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and do it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2849,7 +2854,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. + */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* From e3f51abadf835784471034c583605c6b41154f2c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 13 May 2024 13:18:18 +0300 Subject: [PATCH 081/130] safekeeper: close connection when COPY stream ends. We can't gracefully exit COPY mode (and don't need that), so close connection to prevent further attempts to use it. --- libs/postgres_backend/src/lib.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 260018ad89..6c41b7f347 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -820,10 +820,11 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } - /// Log as info/error result of handling COPY stream and send back - /// ErrorResponse if that makes sense. Shutdown the stream if we got - /// Terminate. TODO: transition into waiting for Sync msg if we initiate the - /// close. + /// - Log as info/error result of handling COPY stream and send back + /// ErrorResponse if that makes sense. + /// - Shutdown the stream if we got Terminate. + /// - Then close the connection because we don't handle exiting from COPY + /// stream normally. pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { use CopyStreamHandlerEnd::*; @@ -849,10 +850,6 @@ impl PostgresBackend { } } - if let Terminate = &end { - self.state = ProtoState::Closed; - } - let err_to_send_and_errcode = match &end { ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), @@ -882,6 +879,12 @@ impl PostgresBackend { error!("failed to send ErrorResponse: {}", ee); } } + + // Proper COPY stream finishing to continue using the connection is not + // implemented at the server side (we don't need it so far). To prevent + // further usages of the connection, close it. + self.framed.shutdown().await.ok(); + self.state = ProtoState::Closed; } } From de8dfee4bda97deb8a2f02e4e4cc0c7641f56d09 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 13 May 2024 13:19:12 +0300 Subject: [PATCH 082/130] safekeeper: log LSNs on walreceiver/walsender exit. Useful for observability. --- safekeeper/src/receive_wal.rs | 29 ++++++++++++++++++++++------- safekeeper/src/send_wal.rs | 11 +++++++---- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 015b53bb2e..0356def7df 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -183,9 +183,19 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_wal_push_guts(pgb).await { + let mut tli: Option> = None; + if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. - pgb.handle_copy_stream_end(end).await; + let handle_end_fut = pgb.handle_copy_stream_end(end); + // If we managed to create the timeline, augment logging with current LSNs etc. + if let Some(tli) = tli { + let info = tli.get_safekeeper_info(&self.conf).await; + handle_end_fut + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn))) + .await; + } else { + handle_end_fut.await; + } } Ok(()) } @@ -193,6 +203,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, + tli: &mut Option>, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -222,13 +233,17 @@ impl SafekeeperPostgresHandler { // Read first message and create timeline if needed. let res = network_reader.read_first_message().await; - let res = if let Ok((tli, next_msg)) = res { + let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = - tli.get_walreceivers().pageserver_feedback_tx.subscribe(); + timeline + .get_walreceivers() + .pageserver_feedback_tx + .subscribe(); + *tli = Some(timeline.clone()); tokio::select! { // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r, + r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r, r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, } } else { @@ -244,13 +259,13 @@ impl SafekeeperPostgresHandler { match acceptor_handle { None => { // failed even before spawning; read_network should have error - Err(res.expect_err("no error with WalAcceptor not spawn")) + Err(network_res.expect_err("no error with WalAcceptor not spawn")) } Some(handle) => { let wal_acceptor_res = handle.await; // If there was any network error, return it. - res?; + network_res?; // Otherwise, WalAcceptor thread must have errored. match wal_acceptor_res { diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 59a8c595ab..ecaae9cfe7 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -340,12 +340,16 @@ impl SafekeeperPostgresHandler { start_pos: Lsn, term: Option, ) -> Result<(), QueryError> { + let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term) + .handle_start_replication_guts(pgb, start_pos, term, tli.clone()) .await { + let info = tli.get_safekeeper_info(&self.conf).await; // Log the result and probably send it to the client, closing the stream. - pgb.handle_copy_stream_end(end).await; + pgb.handle_copy_stream_end(end) + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.flush_lsn))) + .await; } Ok(()) } @@ -355,10 +359,9 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, + tli: Arc, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); - let tli = - GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; // Use a guard object to remove our entry from the timeline when we are done. let ws_guard = Arc::new(tli.get_walsenders().register( From 7701ca45dd2215ecca8b8c3de50926ae9b520ffd Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 20 May 2024 12:08:45 -0400 Subject: [PATCH 083/130] feat(pageserver): generate image layers for sparse keyspace (#7567) Part of https://github.com/neondatabase/neon/issues/7462 Sparse keyspace does not generate image layers for now. This pull request adds support for generating image layers for sparse keyspace. ## Summary of changes * Use the scan interface to generate compaction data for sparse keyspace. * Track num of delta layers reads during scan. * Read-trigger compaction: when a scan on the keyspace touches too many delta files, generate an image layer. There are one hard-coded threshold for now: max delta layers we want to touch for a scan. * L0 compaction does not need to compute holes for metadata keyspace. Know issue: the scan interface currently reads past the image layer, which causes `delta_layer_accessed` keeps increasing even if image layers are generated. The pull request to fix that will be separate, and orthogonal to this one. --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 6 +- pageserver/src/tenant.rs | 105 +++++- pageserver/src/tenant/storage_layer.rs | 19 +- pageserver/src/tenant/timeline.rs | 339 +++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 35 +- 5 files changed, 363 insertions(+), 141 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 402f075365..b4fc4a08ee 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -40,7 +40,11 @@ use utils::bin_ser::DeserializeError; use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; -const MAX_AUX_FILE_DELTAS: usize = 1024; +/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. +pub const MAX_AUX_FILE_DELTAS: usize = 1024; + +/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached. +pub const MAX_AUX_FILE_V2_DELTAS: usize = 64; #[derive(Debug)] pub enum LsnForTimestamp { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d42b9082b7..e598e9d2e3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4777,7 +4777,12 @@ mod tests { info!("Doing vectored read on {:?}", read); let vectored_res = tline - .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .get_vectored_impl( + read.clone(), + reads_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) .await; tline .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) @@ -4826,7 +4831,7 @@ mod tests { .get_vectored_impl( aux_keyspace.clone(), read_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await; @@ -4971,7 +4976,7 @@ mod tests { .get_vectored_impl( read.clone(), current_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await?; @@ -5106,7 +5111,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, - ValuesReconstructState::new(), + &mut ValuesReconstructState::new(), &ctx, ) .await; @@ -5547,7 +5552,7 @@ mod tests { .await?; const NUM_KEYS: usize = 1000; - const STEP: usize = 100; // random update + scan base_key + idx * STEP + const STEP: usize = 10000; // random update + scan base_key + idx * STEP let cancel = CancellationToken::new(); @@ -5580,7 +5585,7 @@ mod tests { let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); - for _ in 0..10 { + for iter in 0..=10 { // Read all the blocks for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = (blknum * STEP) as u32; @@ -5595,7 +5600,7 @@ mod tests { .get_vectored_impl( keyspace.clone(), lsn, - ValuesReconstructState::default(), + &mut ValuesReconstructState::default(), &ctx, ) .await? @@ -5631,17 +5636,91 @@ mod tests { updated[blknum] = lsn; } - // Perform a cycle of flush, compact, and GC - tline.freeze_and_flush().await?; - tline.compact(&cancel, EnumSet::empty(), &ctx).await?; - tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; + // Perform two cycles of flush, compact, and GC + for round in 0..2 { + tline.freeze_and_flush().await?; + tline + .compact( + &cancel, + if iter % 5 == 0 && round == 0 { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + } else { + EnumSet::empty() + }, + &ctx, + ) + .await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } } Ok(()) } + #[tokio::test] + async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let test_key = base_key; + let mut lsn = Lsn(0x10); + + for _ in 0..20 { + lsn = Lsn(lsn.0 + 0x10); + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", 0, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + tline.freeze_and_flush().await?; // force create a delta layer + } + + let before_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? + .len(); + + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + + let after_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? + .len(); + + assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", 0, lsn)) + ); + + Ok(()) + } + #[tokio::test] async fn test_branch_copies_dirty_aux_file_flag() { let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 94a5e9ec47..4c8a518551 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -113,12 +113,17 @@ impl From for ValueReconstructState { } } -/// Bag of data accumulated during a vectored get +/// Bag of data accumulated during a vectored get. pub(crate) struct ValuesReconstructState { + /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` + /// should not expect to get anything from this hashmap. pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + + // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, + delta_layers_visited: u32, } impl ValuesReconstructState { @@ -127,6 +132,7 @@ impl ValuesReconstructState { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), layers_visited: 0, + delta_layers_visited: 0, } } @@ -140,8 +146,17 @@ impl ValuesReconstructState { } } - pub(crate) fn on_layer_visited(&mut self) { + pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { self.layers_visited += 1; + if let ReadableLayer::PersistentLayer(layer) = layer { + if layer.layer_desc().is_delta() { + self.delta_layers_visited += 1; + } + } + } + + pub(crate) fn get_delta_layers_visited(&self) -> u32 { + self.delta_layers_visited } pub(crate) fn get_layers_visited(&self) -> u32 { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1fb1928079..e6b58b7166 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -18,8 +18,8 @@ use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, + NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ @@ -60,7 +60,6 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::timeline::init::LocalLayerFileMetadata; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ @@ -89,6 +88,9 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata, +}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -782,6 +784,11 @@ pub(crate) enum ShutdownMode { Hard, } +struct ImageLayerCreationOutcome { + image: Option, + next_start_key: Key, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -883,7 +890,7 @@ impl Timeline { } let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await; if self.conf.validate_vectored_get { @@ -1028,7 +1035,12 @@ impl Timeline { } GetVectoredImpl::Vectored => { let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) .await; if self.conf.validate_vectored_get { @@ -1116,7 +1128,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - ValuesReconstructState::default(), + &mut ValuesReconstructState::default(), ctx, ) .await; @@ -1193,7 +1205,7 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, - mut reconstruct_state: ValuesReconstructState, + reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { let get_kind = if keyspace.total_raw_size() == 1 { @@ -1205,7 +1217,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1214,7 +1226,8 @@ impl Timeline { .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); - for (key, res) in reconstruct_state.keys { + + for (key, res) in std::mem::take(&mut reconstruct_state.keys) { match res { Err(err) => { results.insert(key, Err(err)); @@ -3448,7 +3461,7 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; - reconstruct_state.on_layer_visited(); + reconstruct_state.on_layer_visited(&layer_to_read); } else { break; } @@ -4134,6 +4147,176 @@ impl Timeline { false } + /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, + /// so that at most one image layer will be produced from this function. + async fn create_image_layer_for_rel_blocks( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + start: Key, + ) -> Result { + let mut wrote_keys = false; + + let mut key_request_accum = KeySpaceAccum::new(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + // Decide whether to retain this key: usually we do, but sharded tenants may + // need to drop keys that don't belong to them. If we retain the key, add it + // to `key_request_accum` for later issuing a vectored get + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } else { + key_request_accum.add_key(key); + } + + let last_key_in_range = key.next() == range.end; + key = key.next(); + + // Maybe flush `key_rest_accum` + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS + || (last_key_in_range && key_request_accum.raw_size() > 0) + { + let results = self + .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .await?; + + for (img_key, img) in results { + let img = match img { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) { + warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(CreateImageLayersError::PageReconstructError(err)); + } + } + }; + + // Write all the keys we just read into our new image layer. + image_layer_writer.put_image(img_key, img, ctx).await?; + wrote_keys = true; + } + } + } + } + + if wrote_keys { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } + } + + /// Create an image layer for metadata keys. This function produces one image layer for all metadata + /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it + /// would not be too large to fit in a single image layer. + #[allow(clippy::too_many_arguments)] + async fn create_image_layer_for_metadata_keys( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + mode: ImageLayerCreationMode, + ) -> Result { + assert!(!matches!(mode, ImageLayerCreationMode::Initial)); + + // Metadata keys image layer creation. + let mut reconstruct_state = ValuesReconstructState::default(); + let data = self + .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + let (data, total_kb_retrieved, total_key_retrieved) = { + let mut new_data = BTreeMap::new(); + let mut total_kb_retrieved = 0; + let mut total_key_retrieved = 0; + for (k, v) in data { + let v = v.map_err(CreateImageLayersError::PageReconstructError)?; + total_kb_retrieved += KEY_SIZE + v.len(); + total_key_retrieved += 1; + new_data.insert(k, v); + } + (new_data, total_kb_retrieved / 1024, total_key_retrieved) + }; + let delta_file_accessed = reconstruct_state.get_delta_layers_visited(); + + let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; + info!( + "generate image layers for metadata keys: trigger_generation={trigger_generation}, \ + delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \ + total_key_retrieved={total_key_retrieved}" + ); + if !trigger_generation && mode == ImageLayerCreationMode::Try { + return Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: img_range.end, + }); + } + let has_keys = !data.is_empty(); + for (k, v) in data { + // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get + // considers this situation properly. + // if v.is_empty() { + // continue; + // } + + // No need to handle sharding b/c metadata keys are always on the 0-th shard. + + // TODO: split image layers to avoid too large layer files. Too large image files are not handled + // on the normal data path either. + image_layer_writer.put_image(k, v, ctx).await?; + } + Ok(ImageLayerCreationOutcome { + image: if has_keys { + let image_layer = image_layer_writer.finish(self, ctx).await?; + Some(image_layer) + } else { + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + None + }, + next_start_key: img_range.end, + }) + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4175,19 +4358,17 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - - if partition.overlaps(&Key::metadata_key_range()) { - // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a - // rather big change. Keep this patch small for now. - match mode { - ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { - // skip image layer creation anyways for metadata keys. - start = img_range.end; - continue; - } - ImageLayerCreationMode::Initial => { - return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); - } + let compact_metadata = partition.overlaps(&Key::metadata_key_range()); + if compact_metadata { + for range in &partition.ranges { + assert!( + range.start.field1 >= METADATA_KEY_BEGIN_PREFIX + && range.end.field1 <= METADATA_KEY_END_PREFIX, + "metadata keys must be partitioned separately" + ); + } + if mode == ImageLayerCreationMode::Initial { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); } } else if let ImageLayerCreationMode::Try = mode { // check_for_image_layers = false -> skip @@ -4198,7 +4379,7 @@ impl Timeline { } } - let mut image_layer_writer = ImageLayerWriter::new( + let image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, @@ -4214,87 +4395,39 @@ impl Timeline { ))) }); - let mut wrote_keys = false; + if !compact_metadata { + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + start, + ) + .await?; - let mut key_request_accum = KeySpaceAccum::new(); - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - // Decide whether to retain this key: usually we do, but sharded tenants may - // need to drop keys that don't belong to them. If we retain the key, add it - // to `key_request_accum` for later issuing a vectored get - if self.shard_identity.is_key_disposable(&key) { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } else { - key_request_accum.add_key(key); - } - - let last_key_in_range = key.next() == range.end; - key = key.next(); - - // Maybe flush `key_rest_accum` - if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS - || (last_key_in_range && key_request_accum.raw_size() > 0) - { - let results = self - .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) - .await?; - - for (img_key, img) in results { - let img = match img { - Ok(img) => img, - Err(err) => { - // If we fail to reconstruct a VM or FSM page, we can zero the - // page without losing any actual user data. That seems better - // than failing repeatedly and getting stuck. - // - // We had a bug at one point, where we truncated the FSM and VM - // in the pageserver, but the Postgres didn't know about that - // and continued to generate incremental WAL records for pages - // that didn't exist in the pageserver. Trying to replay those - // WAL records failed to find the previous image of the page. - // This special case allows us to recover from that situation. - // See https://github.com/neondatabase/neon/issues/2601. - // - // Unfortunately we cannot do this for the main fork, or for - // any metadata keys, keys, as that would lead to actual data - // loss. - if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) - { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); - ZERO_PAGE.clone() - } else { - return Err(CreateImageLayersError::PageReconstructError( - err, - )); - } - } - }; - - // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img, ctx).await?; - wrote_keys = true; - } - } - } - } - - if wrote_keys { - // Normal path: we have written some data into the new image layer for this - // partition, so flush it to disk. - start = img_range.end; - let image_layer = image_layer_writer.finish(self, ctx).await?; - image_layers.push(image_layer); + start = next_start_key; + image_layers.extend(image); } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. - tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + mode, + ) + .await?; + start = next_start_key; + image_layers.extend(image); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ed48b4c9cb..2eff469591 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -116,9 +116,13 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let dense_layers = self + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); + let image_layers = self .create_image_layers( - &dense_partitioning, + &partitioning, lsn, if flags.contains(CompactFlags::ForceImageLayerCreation) { ImageLayerCreationMode::Force @@ -130,24 +134,8 @@ impl Timeline { .await .map_err(anyhow::Error::from)?; - // For now, nothing will be produced... - let sparse_layers = self - .create_image_layers( - &sparse_partitioning.clone().into_dense(), - lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await - .map_err(anyhow::Error::from)?; - assert!(sparse_layers.is_empty()); - - self.upload_new_image_layers(dense_layers)?; - dense_partitioning.parts.len() + self.upload_new_image_layers(image_layers)?; + partitioning.parts.len() } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -499,8 +487,11 @@ impl Timeline { for &DeltaEntry { key: next_key, .. } in all_keys.iter() { if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys. + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { let key_range = prev_key..next_key; // Measuring hole by just subtraction of i128 representation of key range boundaries // has not so much sense, because largest holes will corresponds field1/field2 changes. From 2d7091871f82b6e9d598fdaed8cc28bdc27b9a16 Mon Sep 17 00:00:00 2001 From: Andy Hattemer Date: Mon, 20 May 2024 12:15:43 -0400 Subject: [PATCH 084/130] Update banner image in Readme (#7801) Update the readme banner with updated branding. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 00a90f4483..ea0a289502 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech) +[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech) + + # Neon From 6810d2aa53b7b7646013d2f236d155a4f1b4721d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 20 May 2024 14:24:18 -0400 Subject: [PATCH 085/130] feat(pageserver): do not read past image layers for vectored get (#7773) ## Problem Part of https://github.com/neondatabase/neon/issues/7462 On metadata keyspace, vectored get will not stop if a key is not found, and will read past the image layer. However, the semantics is different from single get, because if a key does not exist in the image layer, it means that the key does not exist in the past, or have been deleted. This pull request fixed it by recording image layer coverage during the vectored get process and stop when the full keyspace is covered by an image layer. A corresponding test case is added to ensure generating image layer reduces the number of delta layers. This optimization (or bug fix) also applies to rel block keyspaces. If a key is missing, we can know it's missing once the first image layer is reached. Page server will not attempt to read lower layers, which potentially incurs layer downloads + evictions. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/keyspace.rs | 2 +- pageserver/src/tenant.rs | 372 +++++++++++++++++- pageserver/src/tenant/storage_layer.rs | 26 +- .../src/tenant/storage_layer/image_layer.rs | 4 + pageserver/src/tenant/timeline.rs | 79 +++- 5 files changed, 461 insertions(+), 22 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c0c4710a00..12c6dc3a6d 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -307,7 +307,7 @@ impl KeySpace { } /// Merge another keyspace into the current one. - /// Note: the keyspaces must not ovelap (enforced via assertions) + /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`. pub fn merge(&mut self, other: &KeySpace) { let all_ranges = self .ranges diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e598e9d2e3..1a66f2c919 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3968,7 +3968,7 @@ mod tests { use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; - use bytes::BytesMut; + use bytes::{Bytes, BytesMut}; use hex_literal::hex; use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; @@ -5996,4 +5996,374 @@ mod tests { Some(&bytes::Bytes::from_static(b"last")) ); } + + #[tokio::test] + async fn test_metadata_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_image_creation")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + let mut lsn = Lsn(0x10); + + async fn scan_with_statistics( + tline: &Timeline, + keyspace: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<(BTreeMap>, usize)> { + let mut reconstruct_state = ValuesReconstructState::default(); + let res = tline + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) + } + + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 1..=10 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + tline.freeze_and_flush().await?; + + if iter % 5 == 0 { + let (_, before_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + let (_, after_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. + assert!( + after_delta_file_accessed <= 2, + "after_delta_file_accessed={after_delta_file_accessed}" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create a image layer + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("data key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; // this will create a delta + + { + // update the partitioning to include the test key space, otherwise they + // will be dropped by image layer creation + let mut guard = child.partitioning.lock().await; + let ((partitioning, _), partition_lsn) = &mut *guard; + partitioning + .parts + .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key + *partition_lsn = lsn; + } + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set + }, + &ctx, + ) + .await?; // force create an image layer for the keys, TODO: check if the image layer is created + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("data key 2")) + ); + assert!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + base_key_child.field1 = AUX_KEY_PREFIX; + base_key_nonexist.field1 = AUX_KEY_PREFIX; + + let mut lsn = Lsn(0x20); + + { + let mut writer = tline.writer().await; + writer + .put( + base_key, + lsn, + &Value::Image(test_img("metadata key 1")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + tline.freeze_and_flush().await?; // this will create an image layer + + tline + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + lsn.0 += 0x10; + + { + let mut writer = child.writer().await; + writer + .put( + base_key_child, + lsn, + &Value::Image(test_img("metadata key 2")), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + child.freeze_and_flush().await?; + + child + .compact( + &cancel, + { + let mut set = EnumSet::empty(); + set.insert(CompactFlags::ForceImageLayerCreation); + set.insert(CompactFlags::ForceRepartition); + set + }, + &ctx, + ) + .await?; // force create an image layer for metadata keys + tenant + .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("metadata key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, + None + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("metadata key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, + None + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4c8a518551..9ccf20c0d4 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -113,14 +113,17 @@ impl From for ValueReconstructState { } } -/// Bag of data accumulated during a vectored get. +/// Bag of data accumulated during a vectored get.. pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. pub(crate) keys: HashMap>, - + /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, + /// The keys covered by the image layers + keys_with_image_coverage: Option>, + // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, @@ -131,6 +134,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, } @@ -186,6 +190,16 @@ impl ValuesReconstructState { } } + /// On hitting image layer, we can mark all keys in this range as done, because + /// if the image layer does not contain a key, it is deleted/never added. + pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { + let prev_val = self.keys_with_image_coverage.replace(key_range.clone()); + assert_eq!( + prev_val, None, + "should consume the keyspace before the next iteration" + ); + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -248,8 +262,12 @@ impl ValuesReconstructState { /// Returns the key space describing the keys that have /// been marked as completed since the last call to this function. - pub(crate) fn consume_done_keys(&mut self) -> KeySpace { - self.keys_done.consume_keyspace() + /// Returns individual keys done, and the image layer coverage. + pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option>) { + ( + self.keys_done.consume_keyspace(), + self.keys_with_image_coverage.take(), + ) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6ea452b993..becd1e7a6d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -158,6 +158,7 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + key_range: Range, lsn: Lsn, file: VirtualFile, @@ -419,6 +420,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + key_range: actual_summary.key_range, })) } @@ -478,6 +480,8 @@ impl ImageLayerInner { self.do_reads_and_update_state(reads, reconstruct_state, ctx) .await; + reconstruct_state.on_image_layer_visited(&self.key_range); + Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e6b58b7166..7f2a41d90c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -21,7 +21,7 @@ use pageserver_api::{ AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, - keyspace::{KeySpaceAccum, SparseKeyPartitioning}, + keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, @@ -348,8 +348,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// When did we last calculate the partitioning? Make it pub to test cases. + pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -483,6 +483,11 @@ impl GcCutoffs { } } +pub(crate) struct TimelineVisitOutcome { + completed_keyspace: KeySpace, + image_covered_keyspace: KeySpace, +} + /// An error happened in a get() operation. #[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { @@ -507,6 +512,13 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl GetVectoredError { + #[cfg(test)] + pub(crate) fn is_missing_key_error(&self) -> bool { + matches!(self, Self::MissingKey(_)) + } +} + #[derive(Debug)] pub struct MissingKeyError { key: Key, @@ -3300,12 +3312,15 @@ impl Timeline { let mut cont_lsn = Lsn(request_lsn.0 + 1); - loop { + let missing_keyspace = loop { if self.cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let completed = Self::get_vectored_reconstruct_data_timeline( + let TimelineVisitOutcome { + completed_keyspace: completed, + image_covered_keyspace, + } = Self::get_vectored_reconstruct_data_timeline( timeline, keyspace.clone(), cont_lsn, @@ -3324,12 +3339,31 @@ impl Timeline { ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], }); - // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look - // into ancestor timelines). TODO: is there any other metadata which we want to inherit? - if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { - break; + // Keyspace is fully retrieved + if keyspace.is_empty() { + break None; } + // Not fully retrieved but no ancestor timeline. + if timeline.ancestor_timeline.is_none() { + break Some(keyspace); + } + + // Now we see if there are keys covered by the image layer but does not exist in the + // image layer, which means that the key does not exist. + + // The block below will stop the vectored search if any of the keys encountered an image layer + // which did not contain a snapshot for said key. Since we have already removed all completed + // keys from `keyspace`, we expect there to be no overlap between it and the image covered key + // space. If that's not the case, we had at least one key encounter a gap in the image layer + // and stop the search as a result of that. + let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + if !removed.is_empty() { + break Some(removed); + } + // If we reached this point, `remove_overlapping_with` should not have made any change to the + // keyspace. + // Take the min to avoid reconstructing a page with data newer than request Lsn. cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline @@ -3337,14 +3371,14 @@ impl Timeline { .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; - } + }; - if keyspace.total_raw_size() != 0 { + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { - key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ shard: self .shard_identity - .get_shard_number(&keyspace.start().unwrap()), + .get_shard_number(&missing_keyspace.start().unwrap()), cont_lsn, request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), @@ -3369,6 +3403,9 @@ impl Timeline { /// /// At each iteration pop the top of the fringe (the layer with the highest Lsn) /// and get all the required reconstruct data from the layer in one go. + /// + /// Returns the completed keyspace and the keyspaces with image coverage. The caller + /// decides how to deal with these two keyspaces. async fn get_vectored_reconstruct_data_timeline( timeline: &Timeline, keyspace: KeySpace, @@ -3376,20 +3413,27 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { let mut unmapped_keyspace = keyspace.clone(); let mut fringe = LayerFringe::new(); let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } - let keys_done_last_step = reconstruct_state.consume_done_keys(); + let (keys_done_last_step, keys_with_image_coverage) = + reconstruct_state.consume_done_keys(); unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + if let Some(keys_with_image_coverage) = keys_with_image_coverage { + unmapped_keyspace + .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone())); + image_covered_keyspace.add_range(keys_with_image_coverage); + } // Do not descent any further if the last layer we visited // completed all keys in the keyspace it inspected. This is not @@ -3467,7 +3511,10 @@ impl Timeline { } } - Ok(completed_keyspace) + Ok(TimelineVisitOutcome { + completed_keyspace, + image_covered_keyspace: image_covered_keyspace.consume_keyspace(), + }) } /// # Cancel-safety From 6f3e043a76dd47a18180eec627ad5f5bbade6186 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 20 May 2024 17:00:47 -0700 Subject: [PATCH 086/130] Add some more replication slot metrics (#7761) ## Problem We want to add alerts for when people's replication slots break, and also metrics for retained WAL so that we can make warn customers when their storage gets bloated. ## Summary of changes Adds the metrics. Addresses https://github.com/neondatabase/neon/issues/7593 --- vm-image-spec.yaml | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index fa7cd014bf..0f9d56e466 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -254,8 +254,8 @@ files: select case when pg_catalog.pg_is_in_recovery() - then pg_last_wal_replay_lsn() - else pg_current_wal_lsn() + then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + else (pg_current_wal_lsn() - '0/0')::FLOAT8 end as lsn; - metric_name: replication_delay_bytes @@ -294,6 +294,9 @@ files: query: | SELECT checkpoints_timed FROM pg_stat_bgwriter; + # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. + # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. + # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. - metric_name: logical_slot_restart_lsn type: gauge @@ -302,7 +305,32 @@ files: - slot_name values: [restart_lsn] query: | - select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical'; + select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical'; + + - metric_name: retained_wal + type: gauge + help: 'Retained WAL in inactive replication slots' + key_labels: + - slot_name + values: [retained_wal] + query: | + SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal + FROM pg_replication_slots + WHERE active = false; + + - metric_name: wal_is_lost + type: gauge + help: 'Whether or not the replication slot\'s wal_status is lost' + key_labels: + - slot_name + values: [wal_status_is_lost] + query: | + SELECT slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_status_is_lost + FROM pg_replication_slots; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling From baeb58432f77809f68eb648481e81ed5af15a8b0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 10:48:17 +0000 Subject: [PATCH 087/130] build(deps): bump requests from 2.31.0 to 2.32.0 (#7816) --- poetry.lock | 21 ++++++++++++++++----- pyproject.toml | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index ef9f572b17..25c0c7398d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2405,6 +2405,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2529,13 +2530,13 @@ files = [ [[package]] name = "requests" -version = "2.31.0" +version = "2.32.0" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, + {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, ] [package.dependencies] @@ -2959,6 +2960,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3196,4 +3207,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e" +content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" diff --git a/pyproject.toml b/pyproject.toml index ac7f9b061c..131d1121f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.31.0" +requests = "^2.32.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" From 4ce6e2d2fc83ff8664eef2f80912cade71240669 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 21 May 2024 13:46:04 +0100 Subject: [PATCH 088/130] pageserver: fix secondary progress stats when layers are 404 (#7814) ## Problem Noticed this issue in staging. When a tenant is under somewhat heavy timeline creation/deletion thrashing, it becomes quite common for secondary downloads to encounter 404s downloading layers. This is tolerated by design, because heatmaps are not guaranteed to be up to date with what layers/timelines actually exist. However, we were not updating the SecondaryProgress structure in this case, so after such a download pass, we would leave a SecondaryProgress state with lower "downloaded" stats than "total" stats. This causes the storage controller to consider this secondary location inelegible for optimization actions such as we do after shard splits This issue has relative low impact because a typical tenant will eventually upload a heatmap where we do download all the layers and thereby enable the controller to progress with migrations -- the heavy thrashing of timeline creation/deletion is an artifact of our nightly stress tests. ## Summary of changes - In the layer 404 case, subtract the skipped layer's stats from the totals, so that at the end of this download pass we should still end up in a complete state. - When updating `last_downloaded`, do a sanity check that our progress is complete. In debug builds, assert out if this is not the case. In prod builds, correct the stats and log a warning. --- pageserver/src/tenant/secondary/downloader.rs | 92 +++++++++++++------ 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 609e1431cf..870475eb57 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -569,6 +569,39 @@ impl<'a> TenantDownloader<'a> { heatmap.timelines.len() ); + // Get or initialize the local disk state for the timelines we will update + let mut timeline_states = HashMap::new(); + for timeline in &heatmap.timelines { + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. + let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + timeline_states.insert(timeline.timeline_id, timeline_state); + } + // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general // principle that deletions should be done before writes wherever possible, and so that we can use this // phase to initialize our SecondaryProgress. @@ -579,6 +612,10 @@ impl<'a> TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { + let timeline_state = timeline_states + .remove(&timeline.timeline_id) + .expect("Just populated above"); + if self.secondary_state.cancel.is_cancelled() { tracing::debug!( "Cancelled before downloading timeline {}", @@ -588,7 +625,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline, ctx) + self.download_timeline(timeline, timeline_state, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -609,6 +646,22 @@ impl<'a> TenantDownloader<'a> { .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL), }); + // Robustness: we should have updated progress properly, but in case we didn't, make sure + // we don't leave the tenant in a state where we claim to have successfully downloaded + // everything, but our progress is incomplete. The invariant here should be that if + // we have set `last_download` to this heatmap's etag, then the next time we see that + // etag we can safely do no work (i.e. we must be complete). + let mut progress = self.secondary_state.progress.lock().unwrap(); + debug_assert!(progress.layers_downloaded == progress.layers_total); + debug_assert!(progress.bytes_downloaded == progress.bytes_total); + if progress.layers_downloaded != progress.layers_total + || progress.bytes_downloaded != progress.bytes_total + { + tracing::warn!("Correcting drift in progress stats ({progress:?})"); + progress.layers_downloaded = progress.layers_total; + progress.bytes_downloaded = progress.bytes_total; + } + Ok(()) } @@ -784,6 +837,7 @@ impl<'a> TenantDownloader<'a> { async fn download_timeline( &self, timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, ctx: &RequestContext, ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -792,34 +846,6 @@ impl<'a> TenantDownloader<'a> { // Accumulate updates to the state let mut touched = Vec::new(); - // Clone a view of what layers already exist on disk - let timeline_state = self - .secondary_state - .detail - .lock() - .unwrap() - .timelines - .get(&timeline.timeline_id) - .cloned(); - - let timeline_state = match timeline_state { - Some(t) => t, - None => { - // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, &timeline).await; - - // Re-acquire detail lock now that we're done with async load from local FS - self.secondary_state - .detail - .lock() - .unwrap() - .timelines - .insert(timeline.timeline_id, timeline_state.clone()); - timeline_state - } - }; - tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); let mut download_futs = Vec::new(); @@ -1009,6 +1035,14 @@ impl<'a> TenantDownloader<'a> { "Skipped downloading missing layer {}, raced with compaction/gc?", layer.name ); + + // If the layer is 404, adjust the progress statistics to reflect that we will not download it. + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.layers_total = progress.layers_total.saturating_sub(1); + progress.bytes_total = progress + .bytes_total + .saturating_sub(layer.metadata.file_size); + return Ok(None); } Err(e) => return Err(e.into()), From 478cc37a70c4357e7817e6c8834264be4d3e3bfd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 21 May 2024 14:32:29 +0300 Subject: [PATCH 089/130] Propagate standby apply LSN to pageserver to hold off GC. To avoid pageserver gc'ing data needed by standby, propagate standby apply LSN through standby -> safekeeper -> broker -> pageserver flow and hold off GC for it. Iteration of GC resets the value to remove the horizon when standby goes away -- pushes are assumed to happen at least once between gc iterations. As a safety guard max allowed lag compared to normal GC horizon is hardcoded as 10GB. Add test for the feature. Co-authored-by: Konstantin Knizhnik --- libs/safekeeper_api/src/models.rs | 3 + pageserver/src/tenant/timeline.rs | 28 +++++- .../walreceiver/connection_manager.rs | 13 +++ safekeeper/src/broker.rs | 1 + safekeeper/src/http/routes.rs | 1 + safekeeper/src/send_wal.rs | 85 +++++++++++++++---- safekeeper/src/timeline.rs | 7 +- storage_broker/benches/rps.rs | 1 + storage_broker/proto/broker.proto | 3 + storage_broker/src/bin/storage_broker.rs | 1 + test_runner/fixtures/neon_fixtures.py | 11 +++ test_runner/regress/test_hot_standby.py | 47 +++++++--- 12 files changed, 169 insertions(+), 32 deletions(-) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index ce5a1e411e..2fbc333075 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -50,6 +50,9 @@ pub struct SkTimelineInfo { pub safekeeper_connstr: Option, #[serde(default)] pub http_connstr: Option, + // Minimum of all active RO replicas flush LSN + #[serde(default = "lsn_invalid")] + pub standby_horizon: Lsn, } #[derive(Debug, Clone, Deserialize, Serialize)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7f2a41d90c..2c43c26359 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -269,6 +269,8 @@ pub struct Timeline { // Atomic would be more appropriate here. last_freeze_ts: RwLock, + pub(crate) standby_horizon: AtomicLsn, + // WAL redo manager. `None` only for broken tenants. walredo_mgr: Option>, @@ -2279,6 +2281,8 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), + standby_horizon: AtomicLsn::new(0), + timeline_get_throttle: resources.timeline_get_throttle, aux_files: tokio::sync::Mutex::new(AuxFilesState { @@ -4844,7 +4848,29 @@ impl Timeline { (horizon_cutoff, pitr_cutoff, retain_lsns) }; - let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let standby_horizon = self.standby_horizon.load(); + // Hold GC for the standby, but as a safety guard do it only within some + // reasonable lag. + if standby_horizon != Lsn::INVALID { + if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) { + const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB + if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG { + new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff); + trace!("holding off GC for standby apply LSN {}", standby_horizon); + } else { + warn!( + "standby is lagging for more than {}MB, not holding gc for it", + MAX_ALLOWED_STANDBY_LAG / 1024 / 1024 + ) + } + } + } + + // Reset standby horizon to ignore it if it is not updated till next GC. + // It is an easy way to unset it when standby disappears without adding + // more conf options. + self.standby_horizon.store(Lsn::INVALID); let res = self .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 991e4ac045..a3c7adae44 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -705,6 +705,7 @@ impl ConnectionManagerState { commit_lsn: info.commit_lsn, safekeeper_connstr: info.safekeeper_connstr, availability_zone: info.availability_zone, + standby_horizon: info.standby_horizon, } } MessageType::SafekeeperDiscoveryResponse => { @@ -725,6 +726,17 @@ impl ConnectionManagerState { WALRECEIVER_BROKER_UPDATES.inc(); + trace!( + "safekeeper info update: standby_horizon(cutoff)={}", + timeline_update.standby_horizon + ); + if timeline_update.standby_horizon != 0 { + // ignore reports from safekeepers not connected to replicas + self.timeline + .standby_horizon + .store(Lsn(timeline_update.standby_horizon)); + } + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -1094,6 +1106,7 @@ mod tests { commit_lsn, safekeeper_connstr: safekeeper_connstr.to_owned(), availability_zone: None, + standby_horizon: 0, }, latest_update, } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index ea16ce450f..46a51438ea 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -186,6 +186,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result< commit_lsn: sk_info.commit_lsn, safekeeper_connstr: sk_info.safekeeper_connstr, availability_zone: sk_info.availability_zone, + standby_horizon: 0, }; // note this is a blocking call diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 30d0081a47..808bb1e490 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -350,6 +350,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } } /// WalSenders registry. Timeline holds it (wrapped in Arc). @@ -162,8 +171,8 @@ impl WalSenders { } /// Get aggregated hot standby feedback (we send it to compute). - pub fn get_hotstandby(self: &Arc) -> HotStandbyFeedback { - self.mutex.lock().agg_hs_feedback + pub fn get_hotstandby(self: &Arc) -> StandbyFeedback { + self.mutex.lock().agg_standby_feedback } /// Record new pageserver feedback, update aggregated values. @@ -184,6 +193,10 @@ impl WalSenders { fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); + debug!( + "Record standby reply: ts={} apply_lsn={}", + reply.reply_ts, reply.apply_lsn + ); match &mut slot.feedback { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { @@ -208,7 +221,7 @@ impl WalSenders { }) } } - shared.update_hs_feedback(); + shared.update_reply_feedback(); } /// Get remote_consistent_lsn reported by the pageserver. Returns None if @@ -226,13 +239,13 @@ impl WalSenders { fn unregister(self: &Arc, id: WalSenderId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; - shared.update_hs_feedback(); + shared.update_reply_feedback(); } } struct WalSendersShared { // aggregated over all walsenders value - agg_hs_feedback: HotStandbyFeedback, + agg_standby_feedback: StandbyFeedback, // last feedback ever received from any pageserver, empty if none last_ps_feedback: PageserverFeedback, // total counter of pageserver feedbacks received @@ -243,7 +256,7 @@ struct WalSendersShared { impl WalSendersShared { fn new() -> Self { WalSendersShared { - agg_hs_feedback: HotStandbyFeedback::empty(), + agg_standby_feedback: StandbyFeedback::empty(), last_ps_feedback: PageserverFeedback::empty(), ps_feedback_counter: 0, slots: Vec::new(), @@ -260,10 +273,11 @@ impl WalSendersShared { self.slots[id].as_mut().expect("walsender doesn't exist") } - /// Update aggregated hot standy feedback. We just take min of valid xmins + /// Update aggregated hot standy and normal reply feedbacks. We just take min of valid xmins /// and ts. - fn update_hs_feedback(&mut self) { + fn update_reply_feedback(&mut self) { let mut agg = HotStandbyFeedback::empty(); + let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { let hs_feedback = standby_feedback.hs_feedback; @@ -276,7 +290,7 @@ impl WalSendersShared { } else { agg.xmin = hs_feedback.xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); } if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID { if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID { @@ -284,11 +298,43 @@ impl WalSendersShared { } else { agg.catalog_xmin = hs_feedback.catalog_xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); + } + let reply = standby_feedback.reply; + if reply.write_lsn != Lsn::INVALID { + if reply_agg.write_lsn != Lsn::INVALID { + reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn); + } else { + reply_agg.write_lsn = reply.write_lsn; + } + } + if reply.flush_lsn != Lsn::INVALID { + if reply_agg.flush_lsn != Lsn::INVALID { + reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn); + } else { + reply_agg.flush_lsn = reply.flush_lsn; + } + } + if reply.apply_lsn != Lsn::INVALID { + if reply_agg.apply_lsn != Lsn::INVALID { + reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn); + } else { + reply_agg.apply_lsn = reply.apply_lsn; + } + } + if reply.reply_ts != 0 { + if reply_agg.reply_ts != 0 { + reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts); + } else { + reply_agg.reply_ts = reply.reply_ts; + } } } } - self.agg_hs_feedback = agg; + self.agg_standby_feedback = StandbyFeedback { + reply: reply_agg, + hs_feedback: agg, + }; } } @@ -793,8 +839,11 @@ mod tests { fn test_hs_feedback_no_valid() { let mut wss = WalSendersShared::new(); push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID); + wss.update_reply_feedback(); + assert_eq!( + wss.agg_standby_feedback.hs_feedback.xmin, + INVALID_FULL_TRANSACTION_ID + ); } #[test] @@ -803,7 +852,7 @@ mod tests { push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); push_feedback(&mut wss, hs_feedback(1, 42)); push_feedback(&mut wss, hs_feedback(1, 64)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, 42); + wss.update_reply_feedback(); + assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42); } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 64f764f191..e97247dc7c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -248,6 +248,7 @@ impl SharedState { &self, ttid: &TenantTimelineId, conf: &SafeKeeperConf, + standby_apply_lsn: Lsn, ) -> SafekeeperTimelineInfo { SafekeeperTimelineInfo { safekeeper_id: conf.my_id.0, @@ -270,6 +271,7 @@ impl SharedState { backup_lsn: self.sk.state.inmem.backup_lsn.0, local_start_lsn: self.sk.state.local_start_lsn.0, availability_zone: conf.availability_zone.clone(), + standby_horizon: standby_apply_lsn.0, } } @@ -663,7 +665,7 @@ impl Timeline { // if this is AppendResponse, fill in proper hot standby feedback. if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - resp.hs_feedback = self.walsenders.get_hotstandby(); + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; } commit_lsn = shared_state.sk.state.inmem.commit_lsn; @@ -716,7 +718,8 @@ impl Timeline { /// Get safekeeper info for broadcasting to broker and other peers. pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { let shared_state = self.write_shared_state().await; - shared_state.get_safekeeper_info(&self.ttid, conf) + let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn; + shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn) } /// Update timeline state with peer safekeeper data. diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index d66cbefa45..1a6fb7fedf 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -147,6 +147,7 @@ async fn publish(client: Option, n_keys: u64) { http_connstr: "zenith-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 7d1b63d23f..a420fd9c66 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -42,6 +42,7 @@ message SafekeeperTimelineInfo { uint64 remote_consistent_lsn = 7; uint64 peer_horizon_lsn = 8; uint64 local_start_lsn = 9; + uint64 standby_horizon = 14; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; // HTTP endpoint connection string @@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse { string safekeeper_connstr = 4; // Availability zone of a safekeeper. optional string availability_zone = 5; + // Replica apply LSN + uint64 standby_horizon = 6; } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 8c88b61abc..0a4af543ab 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -736,6 +736,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 23f30804b4..41377e2db2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4288,6 +4288,17 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): time.sleep(1) +def log_replica_lag(primary: Endpoint, secondary: Endpoint): + last_replay_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + lag = primary_lsn - last_replay_lsn + log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}") + + def wait_for_last_flush_lsn( env: NeonEnv, endpoint: Endpoint, diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 179cc273ec..31f436cb4c 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,9 +1,20 @@ import os import re +import threading import time +from functools import partial +import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + log_replica_lag, + tenant_get_shards, + wait_replica_caughtup, +) +from fixtures.utils import wait_until # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -104,19 +115,28 @@ def test_2_replicas_start(neon_simple_env: NeonEnv): wait_replica_caughtup(primary, secondary2) -# We had an issue that a standby server made GetPage requests with an -# old LSN, based on the last-written LSN cache, to avoid waits in the -# pageserver. However, requesting a page with a very old LSN, such -# that the GC horizon has already advanced past it, results in an -# error from the pageserver: -# "Bad request: tried to request a page version that was garbage collected" +# Test two different scenarios related to gc of data needed by hot standby. # -# To avoid that, the compute<-> pageserver protocol was updated so -# that that the standby now sends two LSNs, the old last-written LSN -# and the current replay LSN. +# When pause_apply is False, standby is mostly caught up with the primary. +# However, in compute <-> pageserver protocol version 1 only one LSN had been +# sent to the pageserver in page request, and to avoid waits in the pageserver +# it was last-written LSN cache value. If page hasn't been updated for a long +# time that resulted in an error from the pageserver: "Bad request: tried to +# request a page version that was garbage collected". For primary this wasn't a +# problem because pageserver always bumped LSN to the newest one; for standy +# that would be incorrect since we might get page fresher then apply LSN. Hence, +# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN +# in case of standby) and not_modified_since which could be used as an +# optimization to avoid waiting. # # https://github.com/neondatabase/neon/issues/6211 -def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): +# +# When pause_apply is True we model standby lagging behind primary (e.g. due to +# high max_standby_streaming_delay). To prevent pageserver from removing data +# still needed by the standby apply LSN is propagated in standby -> safekeepers +# -> broker -> pageserver flow so that pageserver could hold off gc for it. +@pytest.mark.parametrize("pause_apply", [False, True]) +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): tenant_conf = { # set PITR interval to be small, so we can do GC "pitr_interval": "0 s", @@ -160,6 +180,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): # so we still remember the LSNs of the pages. s_cur.execute("SELECT clear_buffer_cache()") + if pause_apply: + s_cur.execute("SELECT pg_wal_replay_pause()") + # Do other stuff on the primary, to advance the WAL p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") @@ -176,6 +199,8 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): # generates use old not_modified_since LSNs, older than # the GC cutoff, but new request LSNs. (In protocol # version 1 there was only one LSN, and this failed.) + log_replica_lag(primary, secondary) s_cur.execute("SELECT COUNT(*) FROM test") + log_replica_lag(primary, secondary) res = s_cur.fetchone() assert res[0] == 10000 From f54c3b96e08f78e56d1f82f8305994e9cc8f867f Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 21 May 2024 14:42:25 +0300 Subject: [PATCH 090/130] Fix bugs in hot standby feedback propagation and add test for it. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/walproposer_pg.c | 19 +++--- safekeeper/src/send_wal.rs | 9 ++- test_runner/regress/test_hot_standby.py | 88 +++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 12 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index e5ef93b456..492a46fd54 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1852,34 +1852,30 @@ static void CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) { hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + hs->xmin = InvalidFullTransactionId; + hs->catalog_xmin = InvalidFullTransactionId; for (int i = 0; i < wp->n_safekeepers; i++) { - if (wp->safekeeper[i].appendResponse.hs.ts != 0) + + if (wp->safekeeper[i].state == SS_ACTIVE) { HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs; if (FullTransactionIdIsNormal(skhs->xmin) - && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin))) { hs->xmin = skhs->xmin; hs->ts = skhs->ts; } if (FullTransactionIdIsNormal(skhs->catalog_xmin) - && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))) { hs->catalog_xmin = skhs->catalog_xmin; hs->ts = skhs->ts; } } } - - if (hs->xmin.value == ~0) - hs->xmin = InvalidFullTransactionId; - if (hs->catalog_xmin.value == ~0) - hs->catalog_xmin = InvalidFullTransactionId; } /* @@ -1946,9 +1942,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) + if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { agg_hs_feedback = hsFeedback; + elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), EpochFromFullTransactionId(hsFeedback.xmin), diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4edd09a318..5a9745e1c9 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -756,8 +756,15 @@ impl ReplyReader { match msg.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { // Note: deserializing is on m[1..] because we skip the tag byte. - let hs_feedback = HotStandbyFeedback::des(&msg[1..]) + let mut hs_feedback = HotStandbyFeedback::des(&msg[1..]) .context("failed to deserialize HotStandbyFeedback")?; + // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way: + // pq_sendint32(&reply_message, xmin); + // pq_sendint32(&reply_message, xmin_epoch); + // So it is two big endian 32-bit words in low endian order! + hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); + hs_feedback.catalog_xmin = + (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 31f436cb4c..244d482c18 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -204,3 +204,91 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): log_replica_lag(primary, secondary) res = s_cur.fetchone() assert res[0] == 10000 + + +def run_pgbench(connstr: str, pg_bin: PgBin): + log.info(f"Start a pgbench workload on pg {connstr}") + # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + log.info("pgbench init done") + pg_bin.run_capture(["pgbench", "-T60", connstr]) + + +# assert that pgbench_accounts and its index are created. +def pgbench_accounts_initialized(ep): + ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass") + + +# Test that hot_standby_feedback works in neon (it is forwarded through +# safekeepers). That is, ensure queries on standby don't fail during load on +# primary under the following conditions: +# - pgbench bombards primary with updates. +# - On the secondary we run long select of the updated table. +# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts +# so apply doesn't need to wait. +# - Do agressive vacuum on primary which still shouldn't create conflicts. +# Actually this appears to be redundant due to microvacuum existence. +# +# Without hs feedback enabled we'd see 'User query might have needed to see row +# versions that must be removed.' errors. +def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + agressive_vacuum_conf = [ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime = 10s", + "autovacuum_vacuum_threshold = 25", + "autovacuum_vacuum_scale_factor = 0.1", + "autovacuum_vacuum_cost_delay = -1", + ] + with env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf + ) as primary: + # It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with + # 'User was holding shared buffer pin for too long.'. + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_standby_streaming_delay=2s", + "neon.protocol_version=2", + "hot_standby_feedback=true", + ], + ) as secondary: + log.info( + f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}" + ) + t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin)) + t.start() + # Wait until pgbench_accounts is created + filled on replica *and* + # index is created. Otherwise index creation would conflict with + # read queries and hs feedback won't save us. + wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + + # Test should fail if hs feedback is disabled anyway, but cross + # check that walproposer sets some xmin. + def xmin_is_not_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert int(slot_xmin) > 0 + + wait_until(10, 1.0, xmin_is_not_null) + for _ in range(1, 5): + # in debug mode takes about 5-7s + balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") + log.info(f"balance={balance}") + log_replica_lag(primary, secondary) + t.join() + + # check xmin is reset when standby is gone + def xmin_is_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert slot_xmin is None + + wait_until(10, 1.0, xmin_is_null) From f2771a99b7cc286a936ecc27949d305d1fe7103c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 May 2024 14:28:36 +0300 Subject: [PATCH 091/130] Add metric for pageserver standby horizon. Co-authored-by: Konstantin Knizhnik --- pageserver/src/metrics.rs | 15 +++++++++++++++ pageserver/src/tenant/timeline.rs | 3 +++ .../timeline/walreceiver/connection_manager.rs | 4 ++++ test_runner/fixtures/metrics.py | 1 + 4 files changed, 23 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5315f0b936..27e25d8e32 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -525,6 +525,15 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static STANDBY_HORIZON: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_standby_horizon", + "Standby apply LSN for which GC is hold off, by timeline.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", @@ -2098,6 +2107,7 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, @@ -2167,6 +2177,9 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2212,6 +2225,7 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, aux_file_size_gauge, @@ -2246,6 +2260,7 @@ impl TimelineMetrics { let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2c43c26359..b0e5275b5f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4871,6 +4871,9 @@ impl Timeline { // It is an easy way to unset it when standby disappears without adding // more conf options. self.standby_horizon.store(Lsn::INVALID); + self.metrics + .standby_horizon_gauge + .set(Lsn::INVALID.0 as i64); let res = self .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index a3c7adae44..1d2ffec08f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -735,6 +735,10 @@ impl ConnectionManagerState { self.timeline .standby_horizon .store(Lsn(timeline_update.standby_horizon)); + self.timeline + .metrics + .standby_horizon_gauge + .set(timeline_update.standby_horizon as i64); } let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8fa67e75c9..8b8075f8c1 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -142,6 +142,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", "pageserver_last_record_lsn", + "pageserver_standby_horizon", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", From d43dcceef9bac464cd2b80bd8f40cfae496c7f62 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 May 2024 18:41:39 +0300 Subject: [PATCH 092/130] Minimize hot standby feedback xmins to next_xid. Hot standby feedback xmins can be greater than next_xid due to sparse update of nextXid on pageserver (to do less writes it advances next xid on 1024). ProcessStandbyHSFeedback ignores such xids from the future; to fix, minimize received xmin to next_xid. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/walproposer_pg.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 492a46fd54..316e23a72e 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1944,13 +1944,26 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) CombineHotStanbyFeedbacks(&hsFeedback, wp); if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { + FullTransactionId xmin = hsFeedback.xmin; + FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; + FullTransactionId next_xid = ReadNextFullTransactionId(); + /* + * Page server is updating nextXid in checkpoint each 1024 transactions, + * so feedback xmin can be actually larger then nextXid and + * function TransactionIdInRecentPast return false in this case, + * preventing update of slot's xmin. + */ + if (FullTransactionIdPrecedes(next_xid, xmin)) + xmin = next_xid; + if (FullTransactionIdPrecedes(next_xid, catalog_xmin)) + catalog_xmin = next_xid; agg_hs_feedback = hsFeedback; elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + XidFromFullTransactionId(xmin), + EpochFromFullTransactionId(xmin), + XidFromFullTransactionId(catalog_xmin), + EpochFromFullTransactionId(catalog_xmin)); } CheckGracefulShutdown(wp); From d9d471e3c43c9e47b0ffc1ce9a796bd15aacf36c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 14 May 2024 13:49:04 -0500 Subject: [PATCH 093/130] Add some Python typing in a few test files --- test_runner/fixtures/neon_fixtures.py | 7 ++++++- test_runner/regress/test_pg_regress.py | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 41377e2db2..5c865baa54 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2721,7 +2721,12 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ): """ Run one of the postgres binaries. diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 2b1b7fff34..c00a8ff6b7 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -1,8 +1,10 @@ # # This file runs pg_regress-based tests. # +from __future__ import annotations + from pathlib import Path -from typing import Optional +from typing import TYPE_CHECKING import pytest from fixtures.neon_fixtures import ( @@ -11,6 +13,12 @@ from fixtures.neon_fixtures import ( ) from fixtures.remote_storage import s3_storage +if TYPE_CHECKING: + from typing import Optional + + from fixtures.neon_fixtures import PgBin + from pytest import CaptureFixture + # Run the main PostgreSQL regression tests, in src/test/regress. # @@ -19,8 +27,8 @@ def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], @@ -86,8 +94,8 @@ def test_pg_regress( def test_isolation( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], @@ -142,8 +150,8 @@ def test_isolation( def test_sql_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, shard_count: Optional[int], From e8b8ebfa1da3f528b5ebfb0a5aff7b946a8eabc1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:39:33 -0500 Subject: [PATCH 094/130] Allow check_restored_datadir_content to ignore certain files Some files may have known differences that we are okay with. --- test_runner/fixtures/neon_fixtures.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5c865baa54..7a660b64ed 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4150,7 +4150,12 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + endpoint: Endpoint, + ignored_files: Optional[list[str]] = None, +): pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) # Get the timeline ID. We need it for the 'basebackup' command @@ -4203,6 +4208,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint if not f.startswith("pg_xact") and not f.startswith("pg_multixact") ] + if ignored_files: + pgdata_files = [f for f in pgdata_files if f not in ignored_files] + restored_files = [f for f in restored_files if f not in ignored_files] + # check that file sets are equal assert pgdata_files == restored_files From 9a4b896636465d4d24e5b11c6a7bdb6aed83f23f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:50:36 -0500 Subject: [PATCH 095/130] Use a constant for database name in test_pg_regress --- test_runner/regress/test_pg_regress.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index c00a8ff6b7..302f94064c 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -33,6 +33,8 @@ def test_pg_regress( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + """ :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this many shards. @@ -50,7 +52,7 @@ def test_pg_regress( # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" From 8030b8e4c5f50a2320888433a39e550bfa1ec22e Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:56:02 -0500 Subject: [PATCH 096/130] Fix test_pg_regress for unlogged relations Previously we worked around file comparison issues by dropping unlogged relations in the pg_regress tests, but this would lead to an unnecessary diff when compared to upstream in our Postgres fork. Instead, we can precompute the files that we know will be different, and ignore them. --- test_runner/regress/test_pg_regress.py | 65 +++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 302f94064c..885a94a557 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -4,13 +4,14 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest from fixtures.neon_fixtures import ( NeonEnvBuilder, check_restored_datadir_content, ) +from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage if TYPE_CHECKING: @@ -87,7 +88,67 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=DBNAME, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) # Run the PostgreSQL "isolation" tests, in src/test/isolation. From 781352bd8e4dca5676168fe67521b178fa91a0ec Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 20 May 2024 10:01:58 -0500 Subject: [PATCH 097/130] Upgrade Postgres v14 to 14.12 --- vendor/postgres-v14 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d6f7e2c604..21ec61d539 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a +Subproject commit 21ec61d539d22a81fe811c2d79e26436820bc3f4 diff --git a/vendor/revisions.json b/vendor/revisions.json index c5b55762fa..ec82786109 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], - "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"] + "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] } From 9d081851ec846523d3893f8e66e8fb5112c1e7ca Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 20 May 2024 10:03:09 -0500 Subject: [PATCH 098/130] Upgrade Postgres v15 to 15.7 --- vendor/postgres-v15 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f0d6b0ef75..e2dbd63345 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a +Subproject commit e2dbd63345c584de75173c27951f111249ae0016 diff --git a/vendor/revisions.json b/vendor/revisions.json index ec82786109..d022e6f455 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], - "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], + "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"], "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] } From e3415706b7b23a30f21051a8063bc257a79e3953 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 20 May 2024 10:04:04 -0500 Subject: [PATCH 099/130] Upgrade Postgres v16 to 16.3 --- libs/walproposer/src/walproposer.rs | 4 ++-- vendor/postgres-v16 | 2 +- vendor/revisions.json | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index fb815607a7..f7b72b205f 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -496,9 +496,9 @@ mod tests { // TODO: When updating Postgres versions, this test will cause // problems. Postgres version in message needs updating. // - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 8ef3c33aa0..c271017c6c 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 +Subproject commit c271017c6c4846be59948766baec2ba4ace5dc9c diff --git a/vendor/revisions.json b/vendor/revisions.json index d022e6f455..a3af9331fe 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], + "v16": ["16.3", "c271017c6c4846be59948766baec2ba4ace5dc9c"], "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"], "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] } From 1988ad8db702a24afc19ce4332e1199531441a8c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 17 May 2024 13:42:51 -0500 Subject: [PATCH 100/130] Extend test_unlogged to include a sequence Unlogged sequences were added in v15, so let's just test to make sure they work on Neon. --- test_runner/regress/test_unlogged.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 708bf0dfeb..137d28b9fa 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -1,4 +1,5 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.pg_version import PgVersion # @@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("CREATE UNLOGGED TABLE iut (id int);") # create index to test unlogged index relation as well cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") - cur.execute("INSERT INTO iut values (42);") + cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;") + cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") @@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv): conn2 = endpoint2.connect() cur2 = conn2.cursor() # after restart table should be empty but valid - cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)") cur2.execute("EXECUTE iut_plan (43);") cur2.execute("SELECT * FROM iut") - assert cur2.fetchall() == [(43,)] + results = cur2.fetchall() + # Unlogged sequences were introduced in v15. On <= v14, the sequence created + # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps + # the old value (2) on restart. While on v15 and above, it's unlogged, so it + # gets reset to 1. + if env.pg_version <= PgVersion.V14: + assert results == [(43, 2)] + else: + assert results == [(43, 1)] From 353afe4fe7dc2c5a131aa01d06d10d0d229fd84a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 21 May 2024 16:13:54 +0100 Subject: [PATCH 101/130] neon_local: run controller's postgres with fsync=off (#7817) ## Problem In `test_storage_controller_many_tenants` we [occasionally](https://neon-github-public-dev.s3.amazonaws.com/reports/main/9155810417/index.html#/testresult/8fbdf57a0e859c2d) see it hit the retry limit on serializable transactions. That's likely due to a combination of relative slow fsync on the hetzner nodes running the test, and the way the test does lots of parallel timeline creations, putting high load on the drive. Running the storage controller's db with fsync=off may help here. ## Summary of changes - Set `fsync=off` in the postgres config for the database used by the storage controller in tests --- control_plane/src/storage_controller.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 96e8276f4d..b6b7ea7762 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -243,9 +243,13 @@ impl StorageController { anyhow::bail!("initdb failed with status {status}"); } + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. tokio::fs::write( &pg_data_path.join("postgresql.conf"), - format!("port = {}", self.postgres_port), + format!("port = {}\nfsync=off\n", self.postgres_port), ) .await?; }; From a8a88ba7bcf99505b1552858db0f5415974b8f8a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 May 2024 20:08:43 +0300 Subject: [PATCH 102/130] test(detach_ancestor): ensure L0 compaction in history is ok (#7813) detaching a timeline from its ancestor can leave the resulting timeline with more L0 layers than the compaction threshold. most of the time, the detached timeline has made progress, and next time the L0 -> L1 compaction happens near the original branch point and not near the last_record_lsn. add a test to ensure that inheriting the historical L0s does not change fullbackup. additionally: - add `wait_until_completed` to test-only timeline checkpoint and compact HTTP endpoints. with `?wait_until_completed=true` the endpoints will wait until the remote client has completed uploads. - for delta layers, describe L0-ness with the `/layer` endpoint Cc: #6994 --- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/routes.rs | 11 ++ pageserver/src/tenant/storage_layer/layer.rs | 1 + .../src/tenant/storage_layer/layer_name.rs | 16 +- test_runner/fixtures/pageserver/http.py | 20 ++- test_runner/regress/test_compaction.py | 3 +- test_runner/regress/test_layer_eviction.py | 4 +- test_runner/regress/test_ondemand_download.py | 1 - .../regress/test_pageserver_layer_rolling.py | 2 +- test_runner/regress/test_sharding.py | 1 - .../regress/test_timeline_detach_ancestor.py | 157 ++++++++++++++++++ test_runner/regress/test_timeline_size.py | 2 +- 12 files changed, 200 insertions(+), 20 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 451ee1a13c..05a444d738 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -789,6 +789,8 @@ pub enum HistoricLayerInfo { lsn_end: Lsn, remote: bool, access_stats: LayerAccessStats, + + l0: bool, }, Image { layer_file_name: String, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0eab6510ca..ec3b1141f3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1736,6 +1736,8 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -1744,6 +1746,9 @@ async fn timeline_compact_handler( .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -1768,6 +1773,8 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { flags |= CompactFlags::ForceImageLayerCreation; } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -1781,6 +1788,10 @@ async fn timeline_checkpoint_handler( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } + json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 97b349f635..45d61ce048 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1264,6 +1264,7 @@ impl LayerInner { lsn_end: lsn_range.end, remote: !resident, access_stats, + l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), } } else { let lsn = self.desc.image_layer_lsn(); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index c733404693..da26e1eeb7 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -347,37 +347,33 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { mod test { use super::*; #[test] - fn image_layer_parse() -> anyhow::Result<()> { + fn image_layer_parse() { let expected = LayerName::Image(ImageLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn: Lsn::from_hex("00000000014FED58").unwrap(), }); - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap(); assert_eq!(parsed, expected,); // Omitting generation suffix is valid - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap(); assert_eq!(parsed, expected,); - - Ok(()) } #[test] - fn delta_layer_parse() -> anyhow::Result<()> { + fn delta_layer_parse() { let expected = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0) ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), lsn_range: Lsn::from_hex("00000000014FED58").unwrap() ..Lsn::from_hex("000000000154C481").unwrap(), }); - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap(); assert_eq!(parsed, expected); // Omitting generation suffix is valid - let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap(); assert_eq!(parsed, expected); - - Ok(()) } } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 4d563a532b..f1f96f6d5f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -56,20 +56,30 @@ class InMemoryLayerInfo: class HistoricLayerInfo: kind: str layer_file_name: str - layer_file_size: Optional[int] + layer_file_size: int lsn_start: str lsn_end: Optional[str] remote: bool + # None for image layers, true if pageserver thinks this is an L0 delta layer + l0: Optional[bool] @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: + # instead of parsing the key range lets keep the definition of "L0" in pageserver + l0_ness = d.get("l0") + assert l0_ness is None or isinstance(l0_ness, bool) + + size = d["layer_file_size"] + assert isinstance(size, int) + return HistoricLayerInfo( kind=d["kind"], layer_file_name=d["layer_file_name"], - layer_file_size=d.get("layer_file_size"), + layer_file_size=size, lsn_start=d["lsn_start"], lsn_end=d.get("lsn_end"), remote=d["remote"], + l0=l0_ness, ) @@ -583,6 +593,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} @@ -590,6 +601,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -656,6 +669,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} @@ -663,6 +677,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 93a16620a3..6a3515e1bd 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -165,7 +165,6 @@ def test_sharding_compaction( image_layer_sizes[layer.layer_file_name] = layer.layer_file_size # Pageserver should assert rather than emit an empty layer file, but double check here - assert layer.layer_file_size is not None assert layer.layer_file_size > 0 shard_has_image_layers.append(len(image_layer_sizes) > 1) @@ -178,7 +177,7 @@ def test_sharding_compaction( # # We only do this check with tiny stripes, because large stripes may not give all shards enough # data to have statistically significant image layers - avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) log.info(f"Shard {shard_id} average image layer size: {avg_size}") assert avg_size > compaction_target_size / 2 diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 0ef4f6d95b..193149ea03 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -272,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): resident_physical_size_metric == 0 ), "ensure that resident_physical_size metric is zero" assert resident_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote + layer.layer_file_size for layer in info.historic_layers if not layer.remote ), "ensure that resident_physical_size metric corresponds to layer map dump" remote_physical_size_metric = ps_http.get_timeline_metric( tenant_id, timeline_id, "pageserver_remote_physical_size" ) assert remote_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote + layer.layer_file_size for layer in info.historic_layers if layer.remote ), "ensure that remote_physical_size metric corresponds to layer map dump" log.info("before runnning GC, ensure that remote_physical size is zero") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b51754c9e0..b137fb3a5c 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -540,7 +540,6 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: for layer in layers.historic_layers: log.info(f"pre-compact: {layer}") - assert layer.layer_file_size is not None, "we must know layer file sizes" layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index aab0536f5a..66b6185aaa 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -287,7 +287,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): total_historic_bytes += sum( layer.layer_file_size for layer in layer_map.historic_layers - if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn + if Lsn(layer.lsn_start) > initdb_lsn ) total_ephemeral_layers += len(layer_map.in_memory_layers) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 1bfeec6f4b..bbb1ad0c6d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -632,7 +632,6 @@ def test_sharding_ingest_layer_sizes( historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) for layer in historic_layers: - assert layer.layer_file_size is not None if layer.layer_file_size < expect_layer_size // 2: classification = "Small" small_layer_count += 1 diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 8406de8bc1..3e435caeee 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -482,6 +482,163 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) +def test_compaction_induced_by_detaches_in_history( + neon_env_builder: NeonEnvBuilder, test_output_dir, pg_distrib_dir, pg_bin: PgBin +): + """ + Assuming the tree of timelines: + + root + |- child1 + |- ... + |- wanted_detached_child + + Each detach can add N more L0 per level, this is actually unbounded because + compaction can be arbitrarily delayed (or detach happen right before one + starts). If "wanted_detached_child" has already made progress and compacted + L1s, we want to make sure "compaction in the history" does not leave the + timeline broken. + """ + + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} + + env = neon_env_builder.init_start( + initial_tenant_conf={ + # we want to create layers manually so we don't branch on arbitrary + # Lsn, but we also do not want to compact L0 -> L1. + "compaction_threshold": "99999", + "compaction_period": "0s", + # shouldn't matter, but just in case + "gc_period": "0s", + } + ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + client = env.pageserver.http_client() + + def delta_layers(timeline_id: TimelineId): + # shorthand for more readable formatting + return client.layer_map_info(env.initial_tenant, timeline_id).delta_layers() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("create table integers (i bigint not null);") + ep.safe_psql("insert into integers (i) values (42)") + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + assert len(delta_layers(env.initial_timeline)) == 2 + + more_good_numbers = range(0, 3) + + branches: List[Tuple[str, TimelineId]] = [("main", env.initial_timeline)] + + for num in more_good_numbers: + branch_name = f"br-{len(branches)}" + branch_timeline_id = env.neon_cli.create_branch( + branch_name, + ancestor_branch_name=branches[-1][0], + tenant_id=env.initial_tenant, + ancestor_start_lsn=branch_lsn, + ) + branches.append((branch_name, branch_timeline_id)) + + with env.endpoints.create_start(branches[-1][0], tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + f"insert into integers (i) select i from generate_series({num}, {num + 100}) as s(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + assert len(delta_layers(branch_timeline_id)) == 1 + + # now fill in the final, most growing timeline + + branch_name, branch_timeline_id = branches[-1] + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql("insert into integers (i) select i from generate_series(50, 500) s(i)") + + last_suffix = None + for suffix in range(0, 4): + ep.safe_psql(f"create table other_table_{suffix} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + last_suffix = suffix + + assert last_suffix is not None + + assert len(delta_layers(branch_timeline_id)) == 5 + + client.patch_tenant_config_client_side( + env.initial_tenant, {"compaction_threshold": 5}, None + ) + + client.timeline_compact(env.initial_tenant, branch_timeline_id) + + # one more layer + ep.safe_psql(f"create table other_table_{last_suffix + 1} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + + # we need to wait here, because the detaches will do implicit tenant restart, + # and we could get unexpected layer counts + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id, wait_until_uploaded=True) + + assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1 + + skip_main = branches[1:] + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] + + # take the fullbackup before and after inheriting the new L0s + fullbackup_before = test_output_dir / "fullbackup-before.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", + "-o", + str(fullbackup_before), + ] + pg_bin.run_capture(cmd, env=psql_env) + + for _, timeline_id in skip_main: + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert reparented == set(), "we have no earlier branches at any level" + + post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" + + # checkpoint does compaction, which in turn decides to run, because + # there is now in total threshold number L0s even if they are not + # adjacent in Lsn space: + # + # inherited flushed during this checkpoint + # \\\\ / + # 1234X5---> lsn + # | + # l1 layers from "fill in the final, most growing timeline" + # + # branch_lsn is between 4 and first X. + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + post_compact_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted" + + fullbackup_after = test_output_dir / "fullbackup_after.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", + "-o", + str(fullbackup_after), + ] + pg_bin.run_capture(cmd, env=psql_env) + + # we don't need to skip any files, because zenith.signal will be identical + tar_cmp(fullbackup_before, fullbackup_after, set()) + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index a6d06df3b6..db5297870e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -656,7 +656,7 @@ def get_physical_size_values( client = env.pageserver.http_client() res.layer_map_file_size_sum = sum( - layer.layer_file_size or 0 + layer.layer_file_size for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers ) From e3f6a07ca300549181042752c5943c71424e476a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 21 May 2024 13:33:29 -0400 Subject: [PATCH 103/130] chore(pageserver): remove metrics for in-memory ingestion (#7823) The metrics was added in https://github.com/neondatabase/neon/pull/7515/ to observe if https://github.com/neondatabase/neon/pull/7467 introduces any perf regressions. The change was deployed on 5/7 and no changes are observed in the metrics. So it's safe to remove the metrics now. Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 7 ------- pageserver/src/pgdatadir_mapping.rs | 5 ----- 2 files changed, 12 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 27e25d8e32..4f2c75d308 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1867,7 +1867,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, - pub(crate) time_spent_on_ingest: Histogram, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { @@ -1891,12 +1890,6 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), - time_spent_on_ingest: register_histogram!( - "pageserver_wal_ingest_put_value_seconds", - "Actual time spent on ingesting a record", - redo_histogram_time_buckets!(), - ) - .expect("failed to define a metric"), }); pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b4fc4a08ee..f9d8c1020d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,7 +9,6 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::metrics::WAL_INGEST; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; use crate::{aux_file, repository::*}; @@ -1702,8 +1701,6 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; - let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); - let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1743,8 +1740,6 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } - timer.observe_duration(); - Ok(()) } From 679e031cf641d45d95705c61e121aa40781cc86f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 21 May 2024 23:31:20 +0200 Subject: [PATCH 104/130] Add dummy lsn lease http and page service APIs (#7815) We want to introduce a concept of temporary and expiring LSN leases. This adds both a http API as well as one for the page service to obtain temporary LSN leases. This adds a dummy implementation to unblock integration work of this API. A functional implementation of the lease feature is deferred to a later step. Fixes #7808 Co-authored-by: Joonas Koivunen --- libs/pageserver_api/src/models.rs | 16 ++++++ libs/pageserver_api/src/shard.rs | 8 +++ pageserver/src/http/openapi_spec.yml | 40 +++++++++++++++ pageserver/src/http/routes.rs | 30 ++++++++++++ pageserver/src/page_service.rs | 73 ++++++++++++++++++++++++++++ pageserver/src/tenant/mgr.rs | 9 +++- pageserver/src/tenant/timeline.rs | 16 +++++- 7 files changed, 190 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 05a444d738..55e9c48421 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -162,6 +162,22 @@ impl std::fmt::Debug for TenantState { } } +/// A temporary lease to a specific lsn inside a timeline. +/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`. +#[serde_as] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct LsnLease { + #[serde_as(as = "SystemTimeAsRfc3339Millis")] + pub valid_until: SystemTime, +} + +serde_with::serde_conv!( + SystemTimeAsRfc3339Millis, + SystemTime, + |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(), + |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) } +); + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 43d9b2e48c..1c05a01926 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -559,6 +559,14 @@ impl ShardIdentity { } } + /// Obtains the shard number and count combined into a `ShardIndex`. + pub fn shard_index(&self) -> ShardIndex { + ShardIndex { + shard_count: self.count, + shard_number: self.number, + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 36c74ed140..107bcd4a22 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -257,6 +257,37 @@ paths: schema: $ref: "#/components/schemas/LsnByTimestampResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Obtain lease for the given LSN + parameters: + - name: lsn + in: query + required: true + schema: + type: string + format: hex + description: A LSN to obtain the lease for + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: "#/components/schemas/LsnLease" + /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: - name: tenant_id @@ -980,6 +1011,15 @@ components: type: string enum: [past, present, future, nodata] + LsnLease: + type: object + required: + - valid_until + properties: + valid_until: + type: string + format: date-time + PageserverUtilization: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ec3b1141f3..c75e4ca5a9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1701,6 +1701,32 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } +// Obtains an lsn lease on the given timeline. +async fn lsn_lease_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let lsn: Lsn = parse_query_param(&request, "lsn")? + .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + let result = timeline + .make_lsn_lease(lsn, &ctx) + .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; + + json_response(StatusCode::OK, result) +} + // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, @@ -2712,6 +2738,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", + |r| api_handler(r, lsn_lease_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 35aba044b2..c066f56c17 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -19,6 +19,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; +use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; @@ -33,6 +34,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::SystemTime; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; @@ -905,6 +907,39 @@ impl PageServerHandler { } } + #[instrument(skip_all, fields(shard_id, %lsn))] + async fn handle_make_lsn_lease( + &self, + pgb: &mut PostgresBackend, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); + let timeline = self + .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .await?; + let lease = timeline.make_lsn_lease(lsn, ctx)?; + let valid_until = lease + .valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| QueryError::Other(e.into()))?; + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"valid_until", + )]))? + .write_message_noflush(&BeMessage::DataRow(&[Some( + &valid_until.as_millis().to_be_bytes(), + )]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + + Ok(()) + } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, @@ -1802,6 +1837,44 @@ where // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("lease lsn ") { + let (_, params_raw) = query_string.split_at("lease lsn ".len()); + let params = params_raw.split_whitespace().collect::>(); + if params.len() != 3 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number {} for lease lsn command", + params.len() + ))); + } + + let tenant_shard_id = TenantShardId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_shard_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_shard_id.tenant_id))?; + + // The caller is responsible for providing correct lsn. + let lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + + match self + .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx) + .await + { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error obtaining lsn lease for {lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? + } + }; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 1d8e2cf6d3..89fdf31849 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; @@ -127,6 +127,8 @@ pub(crate) enum ShardSelector { First, /// Pick the shard that holds this key Page(Key), + /// The shard ID is known: pick the given shard + Known(ShardIndex), } /// A convenience for use with the re_attach ControlPlaneClient function: rather @@ -2067,6 +2069,11 @@ impl TenantManager { return ShardResolveResult::Found(tenant.clone()); } } + ShardSelector::Known(shard) + if tenant.shard_identity.shard_index() == shard => + { + return ShardResolveResult::Found(tenant.clone()); + } _ => continue, } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b0e5275b5f..1f8ee9ffc4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -25,7 +25,7 @@ use pageserver_api::{ models::{ AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - TimelineState, + LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -1532,6 +1532,20 @@ impl Timeline { Ok(()) } + /// Obtains a temporary lease blocking garbage collection for the given LSN + pub(crate) fn make_lsn_lease( + &self, + _lsn: Lsn, + _ctx: &RequestContext, + ) -> anyhow::Result { + const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60); + let lease = LsnLease { + valid_until: SystemTime::now() + LEASE_LENGTH, + }; + // TODO: dummy implementation + Ok(lease) + } + /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { From 00d66e8012c2d4b0df706fb5e89ab8351e4eb402 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 21 May 2024 16:52:48 -0700 Subject: [PATCH 105/130] compute_ctl: Fix handling of missing /neonvm/bin/resize-swap (#7832) The logic added in the original PR (#7434) only worked before sudo was used, because 'sudo foo' will only fail with NotFound if 'sudo' doesn't exist; if 'foo' doesn't exist, then sudo will fail with a normal error exit. This means that compute_ctl may fail to restart if it exits after successfully enabling swap. --- compute_tools/src/swap.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs index c22b6bc14e..024c5b338e 100644 --- a/compute_tools/src/swap.rs +++ b/compute_tools/src/swap.rs @@ -1,3 +1,5 @@ +use std::path::Path; + use anyhow::{anyhow, Context}; use tracing::warn; @@ -17,17 +19,24 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { .arg(size_bytes.to_string()) .spawn(); - if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { - warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); - return Ok(()); - } - child_result .context("spawn() failed") .and_then(|mut child| child.wait().context("wait() failed")) .and_then(|status| match status.success() { true => Ok(()), - false => Err(anyhow!("process exited with {status}")), + false => { + // The command failed. Maybe it was because the resize-swap file doesn't exist? + // The --once flag causes it to delete itself on success so we don't disable swap + // while postgres is running; maybe this is fine. + match Path::new(RESIZE_SWAP_BIN).try_exists() { + Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")), + // The path doesn't exist; we're actually ok + Ok(false) => { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + Ok(()) + }, + } + } }) // wrap any prior error with the overall context that we couldn't run the command .with_context(|| { From bd5cb9e86b2f8623b2528df9fa59daa21759e183 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 22 May 2024 09:34:39 +0100 Subject: [PATCH 106/130] Implement timeline_manager for safekeeper background tasks (#7768) In safekeepers we have several background tasks. Previously `WAL backup` task was spawned by another task called `wal_backup_launcher`. That task received notifications via `wal_backup_launcher_rx` and decided to spawn or kill existing backup task associated with the timeline. This was inconvenient because each code segment that touched shared state was responsible for pushing notification into `wal_backup_launcher_tx` channel. This was error prone because it's easy to miss and could lead to deadlock in some cases, if notification pushing was done in the wrong order. We also had a similar issue with `is_active` timeline flag. That flag was calculated based on the state and code modifying the state had to call function to update the flag. We had a few bugs related to that, when we forgot to update `is_active` flag in some places where it could change. To fix these issues, this PR adds a new `timeline_manager` background task associated with each timeline. This task is responsible for managing all background tasks, including `is_active` flag which is used for pushing broker messages. It is subscribed for updates in timeline state in a loop and decides to spawn/kill background tasks when needed. There is a new structure called `TimelinesSet`. It stores a set of `Arc` and allows to copy the set to iterate without holding the mutex. This is what replaced `is_active` flag for the broker. Now broker push task holds a reference to the `TimelinesSet` with active timelines and use it instead of iterating over all timelines and filtering by `is_active` flag. Also added some metrics for manager iterations and active backup tasks. Ideally manager should be doing not too many iterations and we should not have a lot of backup tasks spawned at the same time. Fixes #7751 --------- Co-authored-by: Arseny Sher --- safekeeper/src/bin/safekeeper.rs | 17 +- safekeeper/src/broker.rs | 11 +- safekeeper/src/lib.rs | 2 + safekeeper/src/metrics.rs | 35 ++- safekeeper/src/receive_wal.rs | 57 ++-- safekeeper/src/remove_wal.rs | 19 +- safekeeper/src/timeline.rs | 404 +++++++++++-------------- safekeeper/src/timeline_manager.rs | 153 ++++++++++ safekeeper/src/timelines_global_map.rs | 60 ++-- safekeeper/src/timelines_set.rs | 90 ++++++ safekeeper/src/wal_backup.rs | 201 +++++------- 11 files changed, 587 insertions(+), 462 deletions(-) create mode 100644 safekeeper/src/timeline_manager.rs create mode 100644 safekeeper/src/timelines_set.rs diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 09c565ce71..aee3898ac7 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -20,7 +20,6 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use storage_broker::Uri; -use tokio::sync::mpsc; use tracing::*; use utils::pid_file; @@ -30,13 +29,13 @@ use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; +use safekeeper::remove_wal; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use safekeeper::{broker, WAL_SERVICE_RUNTIME}; use safekeeper::{control_file, BROKER_RUNTIME}; use safekeeper::{http, WAL_REMOVER_RUNTIME}; -use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; @@ -377,8 +376,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - wal_backup::init_remote_storage(&conf); // Keep handles to main tasks to die if any of them disappears. @@ -391,19 +388,9 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let current_thread_rt = conf .current_thread_runtime .then(|| Handle::try_current().expect("no runtime in main")); - let conf_ = conf.clone(); - let wal_backup_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle()) - .spawn(wal_backup::wal_backup_launcher_task_main( - conf_, - wal_backup_launcher_rx, - )) - .map(|res| ("WAL backup launcher".to_owned(), res)); - tasks_handles.push(Box::pin(wal_backup_handle)); // Load all timelines from disk to memory. - GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?; + GlobalTimelines::init(conf.clone()).await?; let conf_ = conf.clone(); // Run everything in current thread rt, if asked. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 46a51438ea..7cc2142291 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -46,6 +46,8 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { return Ok(()); } + let active_timelines_set = GlobalTimelines::get_global_broker_active_set(); + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -57,15 +59,9 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. let now = Instant::now(); - let all_tlis = GlobalTimelines::get_all(); + let all_tlis = active_timelines_set.get_all(); let mut n_pushed_tlis = 0; for tli in &all_tlis { - // filtering alternative futures::stream::iter(all_tlis) - // .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::>().await; - // doesn't look better, and I'm not sure how to do that without collect. - if !tli.is_active().await { - continue; - } let sk_info = tli.get_safekeeper_info(&conf).await; yield sk_info; BROKER_PUSHED_UPDATES.inc(); @@ -90,6 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. +#[instrument(name = "broker pull", skip_all)] async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 543714a54e..8d8d2cf23e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -31,6 +31,8 @@ pub mod safekeeper; pub mod send_wal; pub mod state; pub mod timeline; +pub mod timeline_manager; +pub mod timelines_set; pub mod wal_backup; pub mod wal_backup_partial; pub mod wal_service; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 28ae042bb3..1e965393e3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -11,8 +11,9 @@ use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge, - IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, + register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, + IntGaugeVec, }; use once_cell::sync::Lazy; @@ -162,6 +163,29 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") }); +pub static MANAGER_ITERATIONS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_iterations_total", + "Number of iterations of the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_iterations_total counter") +}); +pub static MANAGER_ACTIVE_CHANGES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_active_changes_total", + "Number of timeline active status changes in the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_active_changes_total counter") +}); +pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "safekeeper_wal_backup_tasks_started_total", + "Number of active WAL backup tasks", + "safekeeper_wal_backup_tasks_finished_total", + "Number of finished WAL backup tasks", + ) + .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -614,8 +638,7 @@ impl Collector for TimelineCollector { self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); - let timelines = GlobalTimelines::get_all(); - let timelines_count = timelines.len(); + let timelines_count = GlobalTimelines::get_all().len(); let mut active_timelines_count = 0; // Prometheus Collector is sync, and data is stored under async lock. To @@ -746,9 +769,9 @@ impl Collector for TimelineCollector { async fn collect_timeline_metrics() -> Vec { let mut res = vec![]; - let timelines = GlobalTimelines::get_all(); + let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all(); - for tli in timelines { + for tli in active_timelines { if let Some(info) = tli.info_for_metrics().await { res.push(info); } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0356def7df..03cfa882c4 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -45,6 +45,9 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8; pub struct WalReceivers { mutex: Mutex, pageserver_feedback_tx: tokio::sync::broadcast::Sender, + + num_computes_tx: tokio::sync::watch::Sender, + num_computes_rx: tokio::sync::watch::Receiver, } /// Id under which walreceiver is registered in shmem. @@ -55,16 +58,21 @@ impl WalReceivers { let (pageserver_feedback_tx, _) = tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY); + let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize); + Arc::new(WalReceivers { mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }), pageserver_feedback_tx, + num_computes_tx, + num_computes_rx, }) } /// Register new walreceiver. Returned guard provides access to the slot and /// automatically deregisters in Drop. pub fn register(self: &Arc, conn_id: Option) -> WalReceiverGuard { - let slots = &mut self.mutex.lock().slots; + let mut shared = self.mutex.lock(); + let slots = &mut shared.slots; let walreceiver = WalReceiverState { conn_id, status: WalReceiverStatus::Voting, @@ -78,6 +86,9 @@ impl WalReceivers { slots.push(Some(walreceiver)); pos }; + + self.update_num(&shared); + WalReceiverGuard { id: pos, walreceivers: self.clone(), @@ -99,7 +110,18 @@ impl WalReceivers { /// Get number of walreceivers (compute connections). pub fn get_num(self: &Arc) -> usize { - self.mutex.lock().slots.iter().flatten().count() + self.mutex.lock().get_num() + } + + /// Get channel for number of walreceivers. + pub fn get_num_rx(self: &Arc) -> tokio::sync::watch::Receiver { + self.num_computes_rx.clone() + } + + /// Should get called after every update of slots. + fn update_num(self: &Arc, shared: &MutexGuard) { + let num = shared.get_num(); + self.num_computes_tx.send_replace(num); } /// Get state of all walreceivers. @@ -123,6 +145,7 @@ impl WalReceivers { fn unregister(self: &Arc, id: WalReceiverId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; + self.update_num(&shared); } /// Broadcast pageserver feedback to connected walproposers. @@ -137,6 +160,13 @@ struct WalReceiversShared { slots: Vec>, } +impl WalReceiversShared { + /// Get number of walreceivers (compute connections). + fn get_num(&self) -> usize { + self.slots.iter().flatten().count() + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). @@ -456,14 +486,7 @@ impl WalAcceptor { /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; /// it must mean that network thread terminated. async fn run(&mut self) -> anyhow::Result<()> { - // Register the connection and defer unregister. - // Order of the next two lines is important: we want first to remove our entry and then - // update status which depends on registered connections. - let _compute_conn_guard = ComputeConnectionGuard { - timeline: Arc::clone(&self.tli), - }; let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - self.tli.update_status_notify().await?; // After this timestamp we will stop processing AppendRequests and send a response // to the walproposer. walproposer sends at least one AppendRequest per second, @@ -529,19 +552,3 @@ impl WalAcceptor { } } } - -/// Calls update_status_notify in drop to update timeline status. -struct ComputeConnectionGuard { - timeline: Arc, -} - -impl Drop for ComputeConnectionGuard { - fn drop(&mut self) { - let tli = self.timeline.clone(); - tokio::spawn(async move { - if let Err(e) = tli.update_status_notify().await { - error!("failed to update timeline status: {}", e); - } - }); - } -} diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 9dce06a886..98ce671182 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -7,29 +7,18 @@ use tracing::*; use crate::{GlobalTimelines, SafeKeeperConf}; -const ALLOW_INACTIVE_TIMELINES: bool = true; - -pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { +pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> { let wal_removal_interval = Duration::from_millis(5000); loop { let now = tokio::time::Instant::now(); - let mut active_timelines = 0; - let tlis = GlobalTimelines::get_all(); for tli in &tlis { - let is_active = tli.is_active().await; - if is_active { - active_timelines += 1; - } - if !ALLOW_INACTIVE_TIMELINES && !is_active { - continue; - } let ttid = tli.ttid; async { if let Err(e) = tli.maybe_persist_control_file().await { warn!("failed to persist control file: {e}"); } - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await { + if let Err(e) = tli.remove_old_wal().await { error!("failed to remove WAL: {}", e); } } @@ -42,8 +31,8 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { if elapsed > wal_removal_interval { info!( - "WAL removal is too long, processed {} active timelines ({} total) in {:?}", - active_timelines, total_timelines, elapsed + "WAL removal is too long, processed {} timelines in {:?}", + total_timelines, elapsed ); } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e97247dc7c..da2e3f4538 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -8,13 +8,12 @@ use serde::{Deserialize, Serialize}; use tokio::fs; use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{Mutex, MutexGuard}; -use tokio::{ - sync::{mpsc::Sender, watch}, - time::Instant, -}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use tokio::{sync::watch, time::Instant}; use tracing::*; use utils::http::error::ApiError; use utils::{ @@ -33,12 +32,13 @@ use crate::safekeeper::{ }; use crate::send_wal::WalSenders; use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_backup_partial, wal_storage}; +use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -51,8 +51,7 @@ pub struct PeerInfo { /// LSN of the last record. pub flush_lsn: Lsn, pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new - /// sk since backup_lsn. + /// Since which LSN safekeeper has WAL. pub local_start_lsn: Lsn, /// When info was received. Serde annotations are not very useful but make /// the code compile -- we don't rely on this field externally. @@ -97,25 +96,72 @@ impl PeersInfo { } } +pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; + +/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard` that +/// automatically updates `watch::Sender` channels with state on drop. +pub struct WriteGuardSharedState<'a> { + tli: Arc, + guard: RwLockWriteGuard<'a, SharedState>, +} + +impl<'a> WriteGuardSharedState<'a> { + fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { + WriteGuardSharedState { tli, guard } + } +} + +impl<'a> Deref for WriteGuardSharedState<'a> { + type Target = SharedState; + + fn deref(&self) -> &Self::Target { + &self.guard + } +} + +impl<'a> DerefMut for WriteGuardSharedState<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.guard + } +} + +impl<'a> Drop for WriteGuardSharedState<'a> { + fn drop(&mut self) { + let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn())); + let commit_lsn = self.guard.sk.state.inmem.commit_lsn; + + let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| { + if *old != term_flush_lsn { + *old = term_flush_lsn; + true + } else { + false + } + }); + + let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| { + if *old != commit_lsn { + *old = commit_lsn; + true + } else { + false + } + }); + + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); + } +} + /// Shared state associated with database instance pub struct SharedState { /// Safekeeper object - sk: SafeKeeper, + pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. - peers_info: PeersInfo, - /// True when WAL backup launcher oversees the timeline, making sure WAL is - /// offloaded, allows to bother launcher less. - wal_backup_active: bool, - /// True whenever there is at least some pending activity on timeline: live - /// compute connection, pageserver is not caughtup (it must have latest WAL - /// for new compute start) or WAL backuping is not finished. Practically it - /// means safekeepers broadcast info to peers about the timeline, old WAL is - /// trimmed. - /// - /// TODO: it might be better to remove tli completely from GlobalTimelines - /// when tli is inactive instead of having this flag. - active: bool, - last_removed_segno: XLogSegNo, + pub(crate) peers_info: PeersInfo, + pub(crate) last_removed_segno: XLogSegNo, } impl SharedState { @@ -152,8 +198,6 @@ impl SharedState { Ok(Self { sk, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, last_removed_segno: 0, }) } @@ -171,75 +215,10 @@ impl SharedState { Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, last_removed_segno: 0, }) } - fn is_active(&self, num_computes: usize) -> bool { - self.is_wal_backup_required(num_computes) - // FIXME: add tracking of relevant pageservers and check them here individually, - // otherwise migration won't work (we suspend too early). - || self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn - } - - /// Mark timeline active/inactive and return whether s3 offloading requires - /// start/stop action. If timeline is deactivated, control file is persisted - /// as maintenance task does that only for active timelines. - async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool { - let is_active = self.is_active(num_computes); - if self.active != is_active { - info!( - "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}", - ttid, - is_active, - self.sk.state.inmem.remote_consistent_lsn, - self.sk.state.inmem.commit_lsn - ); - if !is_active { - if let Err(e) = self.sk.state.flush().await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } - self.active = is_active; - self.is_wal_backup_action_pending(num_computes) - } - - /// Should we run s3 offloading in current state? - fn is_wal_backup_required(&self, num_computes: usize) -> bool { - let seg_size = self.get_wal_seg_size(); - num_computes > 0 || - // Currently only the whole segment is offloaded, so compare segment numbers. - (self.sk.state.inmem.commit_lsn.segment_number(seg_size) > - self.sk.state.inmem.backup_lsn.segment_number(seg_size)) - } - - /// Is current state of s3 offloading is not what it ought to be? - fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool { - let res = self.wal_backup_active != self.is_wal_backup_required(num_computes); - if res { - let action_pending = if self.is_wal_backup_required(num_computes) { - "start" - } else { - "stop" - }; - trace!( - "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", - self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn - ); - } - res - } - - /// Returns whether s3 offloading is required and sets current status as - /// matching. - fn wal_backup_attend(&mut self, num_computes: usize) -> bool { - self.wal_backup_active = self.is_wal_backup_required(num_computes); - self.wal_backup_active - } - fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } @@ -278,7 +257,7 @@ impl SharedState { /// Get our latest view of alive peers status on the timeline. /// We pass our own info through the broker as well, so when we don't have connection /// to the broker returned vec is empty. - fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { + pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { let now = Instant::now(); self.peers_info .0 @@ -294,18 +273,13 @@ impl SharedState { /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. - fn get_horizon_segno( - &self, - wal_backup_enabled: bool, - extra_horizon_lsn: Option, - ) -> XLogSegNo { + fn get_horizon_segno(&self, extra_horizon_lsn: Option) -> XLogSegNo { let state = &self.sk.state; use std::cmp::min; let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); - if wal_backup_enabled { - horizon_lsn = min(horizon_lsn, state.backup_lsn); - } + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.backup_lsn); if let Some(extra_horizon_lsn) = extra_horizon_lsn { horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } @@ -346,11 +320,6 @@ impl From for ApiError { pub struct Timeline { pub ttid: TenantTimelineId, - /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending ttid instead of concrete command allows to do - /// sending without timeline lock. - pub wal_backup_launcher_tx: Sender, - /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, commit_lsn_watch_rx: watch::Receiver, @@ -362,10 +331,14 @@ pub struct Timeline { term_flush_lsn_watch_tx: watch::Sender, term_flush_lsn_watch_rx: watch::Receiver, + /// Broadcasts shared state updates. + shared_state_version_tx: watch::Sender, + shared_state_version_rx: watch::Receiver, + /// Safekeeper and other state, that should remain consistent and /// synchronized with the disk. This is tokio mutex as we write WAL to disk /// while holding it, ensuring that consensus checks are in order. - mutex: Mutex, + mutex: RwLock, walsenders: Arc, walreceivers: Arc, @@ -384,15 +357,15 @@ pub struct Timeline { /// with different speed. // TODO: add `Arc` here instead of adding each field separately. walsenders_keep_horizon: bool, + + // timeline_manager controlled state + pub(crate) broker_active: AtomicBool, + pub(crate) wal_backup_active: AtomicBool, } impl Timeline { /// Load existing timeline from disk. - pub fn load_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, - ) -> Result { + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); let shared_state = SharedState::restore(conf, &ttid)?; @@ -402,23 +375,27 @@ impl Timeline { shared_state.sk.get_term(), shared_state.sk.flush_lsn(), ))); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let (cancellation_tx, cancellation_rx) = watch::channel(false); let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(shared_state), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(shared_state), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), }) } @@ -426,7 +403,6 @@ impl Timeline { pub fn create_empty( conf: &SafeKeeperConf, ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, @@ -434,25 +410,30 @@ impl Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let (cancellation_tx, cancellation_rx) = watch::channel(false); + let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancellation_rx, cancellation_tx, timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), }) } @@ -463,8 +444,9 @@ impl Timeline { /// and state on disk should remain unchanged. pub async fn init_new( self: &Arc, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, + broker_active_set: Arc, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -495,16 +477,29 @@ impl Timeline { return Err(e); } - self.bootstrap(conf); + self.bootstrap(conf, broker_active_set); Ok(()) } - /// Bootstrap new or existing timeline starting background stasks. - pub fn bootstrap(self: &Arc, conf: &SafeKeeperConf) { + /// Bootstrap new or existing timeline starting background tasks. + pub fn bootstrap( + self: &Arc, + conf: &SafeKeeperConf, + broker_active_set: Arc, + ) { + // Start manager task which will monitor timeline state and update + // background tasks. + tokio::spawn(timeline_manager::main_task( + self.clone(), + conf.clone(), + broker_active_set, + )); + // Start recovery task which always runs on the timeline. if conf.peer_recovery_enabled { tokio::spawn(recovery_main(self.clone(), conf.clone())); } + // TODO: migrate to timeline_manager if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); } @@ -517,10 +512,9 @@ impl Timeline { /// deletion API endpoint is retriable. pub async fn delete( &self, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, - ) -> Result<(bool, bool)> { - let was_active = shared_state.active; + ) -> Result { self.cancel(shared_state); // TODO: It's better to wait for s3 offloader termination before @@ -534,18 +528,12 @@ impl Timeline { wal_backup::delete_timeline(&self.ttid).await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; - Ok((dir_existed, was_active)) + Ok(dir_existed) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. - /// - /// Note that we can't notify backup launcher here while holding - /// shared_state lock, as this is a potential deadlock: caller is - /// responsible for that. Generally we should probably make WAL backup tasks - /// to shut down on their own, checking once in a while whether it is the - /// time. - fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) { + fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { info!("timeline {} is cancelled", self.ttid); let _ = self.cancellation_tx.send(true); // Close associated FDs. Nobody will be able to touch timeline data once @@ -569,30 +557,12 @@ impl Timeline { } /// Take a writing mutual exclusive lock on timeline shared_state. - pub async fn write_shared_state(&self) -> MutexGuard { - self.mutex.lock().await + pub async fn write_shared_state<'a>(self: &'a Arc) -> WriteGuardSharedState<'a> { + WriteGuardSharedState::new(self.clone(), self.mutex.write().await) } - async fn update_status(&self, shared_state: &mut SharedState) -> bool { - shared_state - .update_status(self.walreceivers.get_num(), self.ttid) - .await - } - - /// Update timeline status and kick wal backup launcher to stop/start offloading if needed. - pub async fn update_status_notify(&self) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - let is_wal_backup_action_pending: bool = { - let mut shared_state = self.write_shared_state().await; - self.update_status(&mut shared_state).await - }; - if is_wal_backup_action_pending { - // Can fail only if channel to a static thread got closed, which is not normal at all. - self.wal_backup_launcher_tx.send(self.ttid).await?; - } - Ok(()) + pub async fn read_shared_state(&self) -> ReadGuardSharedState { + self.mutex.read().await } /// Returns true if walsender should stop sending WAL to pageserver. We @@ -604,7 +574,7 @@ impl Timeline { if self.is_cancelled() { return true; } - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; if self.walreceivers.get_num() == 0 { return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; @@ -612,9 +582,9 @@ impl Timeline { false } - /// Ensure taht current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result> { - let ss = self.write_shared_state().await; + /// Ensure that current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; if ss.sk.state.acceptor_state.term != t { bail!( "failed to acquire term {}, current term {}", @@ -625,18 +595,6 @@ impl Timeline { Ok(ss) } - /// Returns whether s3 offloading is required and sets current status as - /// matching it. - pub async fn wal_backup_attend(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state() - .await - .wal_backup_attend(self.walreceivers.get_num()) - } - /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() @@ -647,9 +605,14 @@ impl Timeline { self.term_flush_lsn_watch_rx.clone() } + /// Returns watch channel for SharedState update version. + pub fn get_state_version_rx(&self) -> watch::Receiver { + self.shared_state_version_rx.clone() + } + /// Pass arrived message to the safekeeper. pub async fn process_msg( - &self, + self: &Arc, msg: &ProposerAcceptorMessage, ) -> Result> { if self.is_cancelled() { @@ -657,8 +620,6 @@ impl Timeline { } let mut rmsg: Option; - let commit_lsn: Lsn; - let term_flush_lsn: TermLsn; { let mut shared_state = self.write_shared_state().await; rmsg = shared_state.sk.process_msg(msg).await?; @@ -667,43 +628,28 @@ impl Timeline { if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; } - - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - term_flush_lsn = - TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); } - self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; - self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { - self.write_shared_state().await.get_wal_seg_size() - } - - /// Returns true only if the timeline is loaded and active. - pub async fn is_active(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state().await.active + self.read_shared_state().await.get_wal_seg_size() } /// Returns state of the timeline. pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; (state.sk.state.inmem.clone(), state.sk.state.clone()) } /// Returns latest backup_lsn. pub async fn get_wal_backup_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.state.inmem.backup_lsn + self.read_shared_state().await.sk.state.inmem.backup_lsn } /// Sets backup_lsn to the given value. - pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + pub async fn set_wal_backup_lsn(self: &Arc, backup_lsn: Lsn) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -717,40 +663,34 @@ impl Timeline { /// Get safekeeper info for broadcasting to broker and other peers. pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { - let shared_state = self.write_shared_state().await; let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn; + let shared_state = self.read_shared_state().await; shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn) } /// Update timeline state with peer safekeeper data. - pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> { - let is_wal_backup_action_pending: bool; - let commit_lsn: Lsn; + pub async fn record_safekeeper_info( + self: &Arc, + sk_info: SafekeeperTimelineInfo, + ) -> Result<()> { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); - is_wal_backup_action_pending = self.update_status(&mut shared_state).await; - commit_lsn = shared_state.sk.state.inmem.commit_lsn; - } - self.commit_lsn_watch_tx.send(commit_lsn)?; - // Wake up wal backup launcher, if it is time to stop the offloading. - if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } /// Update in memory remote consistent lsn. - pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + pub async fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) { let mut shared_state = self.write_shared_state().await; shared_state.sk.state.inmem.remote_consistent_lsn = max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); } pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } @@ -772,7 +712,7 @@ impl Timeline { /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). /// Thus we don't try to predict it here. pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.write_shared_state().await; + let ss = self.read_shared_state().await; let term = ss.sk.state.acceptor_state.term; let last_log_term = ss.sk.get_epoch(); let flush_lsn = ss.sk.flush_lsn(); @@ -843,12 +783,12 @@ impl Timeline { /// Returns flush_lsn. pub async fn get_flush_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.wal_store.flush_lsn() + self.read_shared_state().await.sk.wal_store.flush_lsn() } /// Delete WAL segments from disk that are no longer needed. This is determined /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + pub async fn remove_old_wal(self: &Arc) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -864,9 +804,8 @@ impl Timeline { let horizon_segno: XLogSegNo; let remover = { - let shared_state = self.write_shared_state().await; - horizon_segno = - shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn); + let shared_state = self.read_shared_state().await; + horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); // nothing to do } @@ -888,7 +827,7 @@ impl Timeline { /// passed after the last save. This helps to keep remote_consistent_lsn up /// to date so that storage nodes restart doesn't cause many pageserver -> /// safekeeper reconnections. - pub async fn maybe_persist_control_file(&self) -> Result<()> { + pub async fn maybe_persist_control_file(self: &Arc) -> Result<()> { self.write_shared_state() .await .sk @@ -896,38 +835,33 @@ impl Timeline { .await } - /// Gather timeline data for metrics. If the timeline is not active, returns - /// None, we do not collect these. + /// Gather timeline data for metrics. pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { return None; } let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); - let state = self.write_shared_state().await; - if state.active { - Some(FullTimelineInfo { - ttid: self.ttid, - ps_feedback_count, - last_ps_feedback, - wal_backup_active: state.wal_backup_active, - timeline_is_active: state.active, - num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.state.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), - }) - } else { - None - } + let state = self.read_shared_state().await; + Some(FullTimelineInfo { + ttid: self.ttid, + ps_feedback_count, + last_ps_feedback, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + timeline_is_active: self.broker_active.load(Ordering::Relaxed), + num_computes: self.walreceivers.get_num() as u32, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.state.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), + }) } /// Returns in-memory timeline state to build a full debug dump. pub async fn memory_dump(&self) -> debug_dump::Memory { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = state.sk.wal_store.internal_state(); @@ -936,8 +870,8 @@ impl Timeline { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), walsenders: self.walsenders.get_all(), - wal_backup_active: state.wal_backup_active, - active: state.active, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: state.last_removed_segno, epoch_start_lsn: state.sk.epoch_start_lsn, @@ -951,7 +885,7 @@ impl Timeline { /// Apply a function to the control file state and persist it. pub async fn map_control_file( - &self, + self: &Arc, f: impl FnOnce(&mut TimelinePersistentState) -> Result, ) -> Result { let mut state = self.write_shared_state().await; diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs new file mode 100644 index 0000000000..52ad915065 --- /dev/null +++ b/safekeeper/src/timeline_manager.rs @@ -0,0 +1,153 @@ +//! The timeline manager task is responsible for managing the timeline's background tasks. +//! It is spawned alongside each timeline and exits when the timeline is deleted. +//! It watches for changes in the timeline state and decides when to spawn or kill background tasks. +//! It also can manage some reactive state, like should the timeline be active for broker pushes or not. + +use std::{sync::Arc, time::Duration}; + +use tracing::{info, instrument, warn}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + timeline::{PeerInfo, ReadGuardSharedState, Timeline}, + timelines_set::TimelinesSet, + wal_backup::{self, WalBackupTaskHandle}, + SafeKeeperConf, +}; + +pub struct StateSnapshot { + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub peers: Vec, +} + +impl StateSnapshot { + /// Create a new snapshot of the timeline state. + fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self { + Self { + commit_lsn: read_guard.sk.state.inmem.commit_lsn, + backup_lsn: read_guard.sk.state.inmem.backup_lsn, + remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, + peers: read_guard.get_peers(heartbeat_timeout), + } + } +} + +/// Control how often the manager task should wake up to check updates. +/// There is no need to check for updates more often than this. +const REFRESH_INTERVAL: Duration = Duration::from_millis(300); + +/// This task gets spawned alongside each timeline and is responsible for managing the timeline's +/// background tasks. +#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task( + tli: Arc, + conf: SafeKeeperConf, + broker_active_set: Arc, +) { + let mut cancellation_rx = match tli.get_cancellation_rx() { + Ok(rx) => rx, + Err(_) => { + info!("timeline canceled during task start"); + return; + } + }; + + scopeguard::defer! { + if tli.is_cancelled() { + info!("manager task finished"); + } else { + warn!("manager task finished prematurely"); + } + }; + + // sets whether timeline is active for broker pushes or not + let mut tli_broker_active = broker_active_set.guard(tli.clone()); + + let ttid = tli.ttid; + let wal_seg_size = tli.get_wal_seg_size().await; + let heartbeat_timeout = conf.heartbeat_timeout; + + let mut state_version_rx = tli.get_state_version_rx(); + + let walreceivers = tli.get_walreceivers(); + let mut num_computes_rx = walreceivers.get_num_rx(); + + // list of background tasks + let mut backup_task: Option = None; + + let last_state = 'outer: loop { + MANAGER_ITERATIONS_TOTAL.inc(); + + let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); + let num_computes = *num_computes_rx.borrow(); + + let is_wal_backup_required = + wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot); + + if conf.is_wal_backup_enabled() { + wal_backup::update_task( + &conf, + ttid, + is_wal_backup_required, + &state_snapshot, + &mut backup_task, + ) + .await; + } + + let is_active = is_wal_backup_required + || num_computes > 0 + || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn; + + // update the broker timeline set + if tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + + if !is_active { + // TODO: maybe use tokio::spawn? + if let Err(e) = tli.maybe_persist_control_file().await { + warn!("control file save in update_status failed: {:?}", e); + } + } + } + + // update the state in Arc + tli.wal_backup_active + .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); + tli.broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + + // wait until something changes. tx channels are stored under Arc, so they will not be + // dropped until the manager task is finished. + tokio::select! { + _ = cancellation_rx.changed() => { + // timeline was deleted + break 'outer state_snapshot; + } + _ = async { + // don't wake up on every state change, but at most every REFRESH_INTERVAL + tokio::time::sleep(REFRESH_INTERVAL).await; + let _ = state_version_rx.changed().await; + } => { + // state was updated + } + _ = num_computes_rx.changed() => { + // number of connected computes was updated + } + } + }; + + // shutdown background tasks + if conf.is_wal_backup_enabled() { + wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await; + } +} diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 079e706ff8..8d37bd6371 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -4,6 +4,7 @@ use crate::safekeeper::ServerInfo; use crate::timeline::{Timeline, TimelineError}; +use crate::timelines_set::TimelinesSet; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -11,16 +12,16 @@ use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; +use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; -use tokio::sync::mpsc::Sender; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, - wal_backup_launcher_tx: Option>, conf: Option, + broker_active_set: Arc, load_lock: Arc>, } @@ -36,11 +37,8 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { - ( - self.get_conf().clone(), - self.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { + (self.get_conf().clone(), self.broker_active_set.clone()) } /// Insert timeline into the map. Returns error if timeline with the same id already exists. @@ -65,8 +63,8 @@ impl GlobalTimelinesState { static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - wal_backup_launcher_tx: None, conf: None, + broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -76,16 +74,11 @@ pub struct GlobalTimelines; impl GlobalTimelines { /// Inject dependencies needed for the timeline constructors and load all timelines to memory. - pub async fn init( - conf: SafeKeeperConf, - wal_backup_launcher_tx: Sender, - ) -> Result<()> { + pub async fn init(conf: SafeKeeperConf) -> Result<()> { // clippy isn't smart enough to understand that drop(state) releases the // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -129,12 +122,9 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); - ( - state.get_conf().clone(), - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + state.get_dependencies() }; let timelines_dir = conf.tenant_dir(&tenant_id); @@ -147,7 +137,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); TIMELINES_STATE @@ -155,8 +145,7 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); - tli.update_status_notify().await.unwrap(); + tli.bootstrap(&conf, broker_active_set.clone()); } // If we can't load a timeline, it's most likely because of a corrupted // directory. We will log an error and won't allow to delete/recreate @@ -189,9 +178,9 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); @@ -202,7 +191,7 @@ impl GlobalTimelines { .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); + tli.bootstrap(&conf, broker_active_set); Ok(tli) } @@ -221,6 +210,10 @@ impl GlobalTimelines { TIMELINES_STATE.lock().unwrap().get_conf().clone() } + pub fn get_global_broker_active_set() -> Arc { + TIMELINES_STATE.lock().unwrap().broker_active_set.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. pub async fn create( @@ -229,7 +222,7 @@ impl GlobalTimelines { commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -243,7 +236,6 @@ impl GlobalTimelines { let timeline = Arc::new(Timeline::create_empty( &conf, ttid, - wal_backup_launcher_tx, server_info, commit_lsn, local_start_lsn, @@ -264,7 +256,10 @@ impl GlobalTimelines { // Write the new timeline to the disk and start background workers. // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. - if let Err(e) = timeline.init_new(&mut shared_state, &conf).await { + if let Err(e) = timeline + .init_new(&mut shared_state, &conf, broker_active_set) + .await + { // Note: the most likely reason for init failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -281,8 +276,6 @@ impl GlobalTimelines { // We are done with bootstrap, release the lock, return the timeline. // {} block forces release before .await } - timeline.update_status_notify().await?; - timeline.wal_backup_launcher_tx.send(timeline.ttid).await?; Ok(timeline) } @@ -335,12 +328,13 @@ impl GlobalTimelines { let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); match tli_res { Ok(timeline) => { + let was_active = timeline.broker_active.load(Ordering::Relaxed); + // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; info!("deleting timeline {}, only_local={}", ttid, only_local); - let (dir_existed, was_active) = - timeline.delete(&mut shared_state, only_local).await?; + let dir_existed = timeline.delete(&mut shared_state, only_local).await?; // Remove timeline from the map. // FIXME: re-enable it once we fix the issue with recreation of deleted timelines @@ -349,7 +343,7 @@ impl GlobalTimelines { Ok(TimelineDeleteForceResult { dir_existed, - was_active, + was_active, // TODO: we probably should remove this field }) } Err(_) => { diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs new file mode 100644 index 0000000000..ea8e23bb72 --- /dev/null +++ b/safekeeper/src/timelines_set.rs @@ -0,0 +1,90 @@ +use std::{collections::HashMap, sync::Arc}; + +use utils::id::TenantTimelineId; + +use crate::timeline::Timeline; + +/// Set of timelines, supports operations: +/// - add timeline +/// - remove timeline +/// - clone the set +/// +/// Usually used for keeping subset of timelines. For example active timelines that require broker push. +pub struct TimelinesSet { + timelines: std::sync::Mutex>>, +} + +impl Default for TimelinesSet { + fn default() -> Self { + Self { + timelines: std::sync::Mutex::new(HashMap::new()), + } + } +} + +impl TimelinesSet { + pub fn insert(&self, tli: Arc) { + self.timelines.lock().unwrap().insert(tli.ttid, tli); + } + + pub fn delete(&self, ttid: &TenantTimelineId) { + self.timelines.lock().unwrap().remove(ttid); + } + + /// If present is true, adds timeline to the set, otherwise removes it. + pub fn set_present(&self, tli: Arc, present: bool) { + if present { + self.insert(tli); + } else { + self.delete(&tli.ttid); + } + } + + pub fn is_present(&self, ttid: &TenantTimelineId) -> bool { + self.timelines.lock().unwrap().contains_key(ttid) + } + + /// Returns all timelines in the set. + pub fn get_all(&self) -> Vec> { + self.timelines.lock().unwrap().values().cloned().collect() + } + + /// Returns a timeline guard for easy presence control. + pub fn guard(self: &Arc, tli: Arc) -> TimelineSetGuard { + let is_present = self.is_present(&tli.ttid); + TimelineSetGuard { + timelines_set: self.clone(), + tli, + is_present, + } + } +} + +/// Guard is used to add or remove timeline from the set. +/// If the timeline present in set, it will be removed from it on drop. +/// Note: do not use more than one guard for the same timeline, it caches the presence state. +/// It is designed to be used in the manager task only. +pub struct TimelineSetGuard { + timelines_set: Arc, + tli: Arc, + is_present: bool, +} + +impl TimelineSetGuard { + /// Returns true if the state was changed. + pub fn set(&mut self, present: bool) -> bool { + if present == self.is_present { + return false; + } + self.is_present = present; + self.timelines_set.set_present(self.tli.clone(), present); + true + } +} + +impl Drop for TimelineSetGuard { + fn drop(&mut self) { + // remove timeline from the map on drop + self.timelines_set.delete(&self.tli.ttid); + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e496f07114..84680557f9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -9,7 +9,7 @@ use utils::backoff; use utils::id::NodeId; use std::cmp::min; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::sync::Arc; @@ -29,9 +29,10 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS}; +use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::{PeerInfo, Timeline}; -use crate::{GlobalTimelines, SafeKeeperConf}; +use crate::timeline_manager::StateSnapshot; +use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; @@ -41,35 +42,84 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; /// Default buffer size when interfacing with [`tokio::fs::File`]. const BUFFER_SIZE: usize = 32 * 1024; -/// Check whether wal backup is required for timeline. If yes, mark that launcher is -/// aware of current status and return the timeline. -async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { - match GlobalTimelines::get(ttid).ok() { - Some(tli) => { - tli.wal_backup_attend().await; - Some(tli) - } - None => None, - } -} - -struct WalBackupTaskHandle { +pub struct WalBackupTaskHandle { shutdown_tx: Sender<()>, handle: JoinHandle<()>, } -struct WalBackupTimelineEntry { - timeline: Arc, - handle: Option, +/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? +pub fn is_wal_backup_required( + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, +) -> bool { + num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size)) } -async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) { - if let Some(wb_handle) = entry.handle.take() { +/// Based on peer information determine which safekeeper should offload; if it +/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task +/// is running, kill it. +pub async fn update_task( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + need_backup: bool, + state: &StateSnapshot, + entry: &mut Option, +) { + let (offloader, election_dbg_str) = + determine_offloader(&state.peers, state.backup_lsn, ttid, conf); + let elected_me = Some(conf.my_id) == offloader; + + let should_task_run = need_backup && elected_me; + + // start or stop the task + if should_task_run != (entry.is_some()) { + if should_task_run { + info!("elected for backup: {}", election_dbg_str); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&ttid); + + let async_task = backup_task_main( + ttid, + timeline_dir, + conf.workdir.clone(), + conf.backup_parallel_jobs, + shutdown_rx, + ); + + let handle = if conf.current_thread_runtime { + tokio::spawn(async_task) + } else { + WAL_BACKUP_RUNTIME.spawn(async_task) + }; + + *entry = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); + } else { + if !need_backup { + // don't need backup at all + info!("stepping down from backup, need_backup={}", need_backup); + } else { + // someone else has been elected + info!("stepping down from backup: {}", election_dbg_str); + } + shut_down_task(entry).await; + } + } +} + +async fn shut_down_task(entry: &mut Option) { + if let Some(wb_handle) = entry.take() { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", ttid, e); + warn!("WAL backup task panicked: {}", e); } } } @@ -126,49 +176,6 @@ fn determine_offloader( } } -/// Based on peer information determine which safekeeper should offload; if it -/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task -/// is running, kill it. -async fn update_task( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - entry: &mut WalBackupTimelineEntry, -) { - let alive_peers = entry.timeline.get_peers(conf).await; - let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await; - let (offloader, election_dbg_str) = - determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); - let elected_me = Some(conf.my_id) == offloader; - - if elected_me != (entry.handle.is_some()) { - if elected_me { - info!("elected for backup: {}", election_dbg_str); - - let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - - let handle = tokio::spawn( - backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ) - .in_current_span(), - ); - - entry.handle = Some(WalBackupTaskHandle { - shutdown_tx, - handle, - }); - } else { - info!("stepping down from backup: {}", election_dbg_str); - shut_down_task(ttid, entry).await; - } - } -} - static REMOTE_STORAGE: OnceCell> = OnceCell::new(); // Storage must be configured and initialized when this is called. @@ -190,67 +197,6 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) { }); } -const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; - -/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup -/// tasks. Having this in separate task simplifies locking, allows to reap -/// panics and separate elections from offloading itself. -pub async fn wal_backup_launcher_task_main( - conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, -) -> anyhow::Result<()> { - info!( - "WAL backup launcher started, remote config {:?}", - conf.remote_storage - ); - - // Presence in this map means launcher is aware s3 offloading is needed for - // the timeline, but task is started only if it makes sense for to offload - // from this safekeeper. - let mut tasks: HashMap = HashMap::new(); - - let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); - loop { - tokio::select! { - ttid = wal_backup_launcher_rx.recv() => { - // channel is never expected to get closed - let ttid = ttid.unwrap(); - if !conf.is_wal_backup_enabled() { - continue; /* just drain the channel and do nothing */ - } - async { - let timeline = is_wal_backup_required(ttid).await; - // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&ttid) { - if let Some(timeline) = timeline { - // need to start the task - let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { - timeline, - handle: None, - }); - update_task(&conf, ttid, entry).await; - } else { - // need to stop the task - info!("stopping WAL backup task"); - let mut entry = tasks.remove(&ttid).unwrap(); - shut_down_task(ttid, &mut entry).await; - } - } - }.instrument(info_span!("WAL backup", ttid = %ttid)).await; - } - // For each timeline needing offloading, check if this safekeeper - // should do the job and start/stop the task accordingly. - _ = ticker.tick() => { - for (ttid, entry) in tasks.iter_mut() { - update_task(&conf, *ttid, entry) - .instrument(info_span!("WAL backup", ttid = %ttid)) - .await; - } - } - } - } -} - struct WalBackupTask { timeline: Arc, timeline_dir: Utf8PathBuf, @@ -261,6 +207,7 @@ struct WalBackupTask { } /// Offload single timeline. +#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))] async fn backup_task_main( ttid: TenantTimelineId, timeline_dir: Utf8PathBuf, @@ -268,6 +215,8 @@ async fn backup_task_main( parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { + let _guard = WAL_BACKUP_TASKS.guard(); + info!("started"); let res = GlobalTimelines::get(ttid); if let Err(e) = res { From 664f92dc6e6eac25380272fe4e466b4ddc4954bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 22 May 2024 12:43:03 +0200 Subject: [PATCH 107/130] Refactor PageServerHandler::process_query parsing (#7835) In the process_query function in page_service.rs there was some redundant duplication. Remove it and create a vector of whitespace separated parts at the start and then use `slice::strip_prefix`. Only use `starts_with` in the places with multiple whitespace separated parameters: here we want to preserve grep/rg ability. Followup of #7815, requested in https://github.com/neondatabase/neon/pull/7815#pullrequestreview-2068835674 --- pageserver/src/page_service.rs | 70 ++++++++++++---------------------- 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c066f56c17..d250864fd6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1521,9 +1521,8 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream_v2 ") { - let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); - let params = params_raw.split(' ').collect::>(); + let parts = query_string.split_whitespace().collect::>(); + if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1548,9 +1547,7 @@ where ctx, ) .await?; - } else if query_string.starts_with("pagestream ") { - let (_, params_raw) = query_string.split_at("pagestream ".len()); - let params = params_raw.split(' ').collect::>(); + } else if let Some(params) = parts.strip_prefix(&["pagestream"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1575,10 +1572,7 @@ where ctx, ) .await?; - } else if query_string.starts_with("basebackup ") { - let (_, params_raw) = query_string.split_at("basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for basebackup command" @@ -1596,26 +1590,23 @@ where self.check_permission(Some(tenant_id))?; - let lsn = if params.len() >= 3 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let gzip = if params.len() >= 4 { - if params[3] == "--gzip" { - true - } else { + let gzip = match params.get(3) { + Some(&"--gzip") => true, + None => false, + Some(third_param) => { return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {}", - params[3], - ))); + "Parameter in position 3 unknown {third_param}", + ))) } - } else { - false }; let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); @@ -1639,10 +1630,7 @@ where res?; } // return pair of prev_lsn and last_lsn - else if query_string.starts_with("get_last_record_rlsn ") { - let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for get_last_record_rlsn command" @@ -1684,10 +1672,7 @@ where .await?; } // same as basebackup, but result includes relational data as well - else if query_string.starts_with("fullbackup ") { - let (_, params_raw) = query_string.split_at("fullbackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for fullbackup command" @@ -1704,18 +1689,18 @@ where .record("timeline_id", field::display(timeline_id)); // The caller is responsible for providing correct lsn and prev_lsn. - let lsn = if params.len() > 2 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let prev_lsn = if params.len() > 3 { + let prev_lsn = if let Some(prev_lsn_str) = params.get(3) { Some( - Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?, + Lsn::from_str(prev_lsn_str) + .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, ) } else { None @@ -1748,8 +1733,7 @@ where // 2. Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let (_, params_raw) = query_string.split_at("import basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 5 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import basebackup command" @@ -1798,8 +1782,7 @@ where // // Files are scheduled to be persisted to remote storage, and the // caller should poll the http api to check when that is done. - let (_, params_raw) = query_string.split_at("import wal ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 4 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import wal command" @@ -1838,8 +1821,7 @@ where // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("lease lsn ") { - let (_, params_raw) = query_string.split_at("lease lsn ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 3 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number {} for lease lsn command", @@ -1875,10 +1857,8 @@ where ))? } }; - } else if query_string.starts_with("show ") { + } else if let Some(params) = parts.strip_prefix(&["show"]) { // show - let (_, params_raw) = query_string.split_at("show ".len()); - let params = params_raw.split(' ').collect::>(); if params.len() != 1 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for config command" From b43f6daa488de05691775a8908920d274fb53369 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 15:53:30 +0300 Subject: [PATCH 108/130] One more iteration on making walcraft test more robust. Some WAL might be inserted on the page boundary before XLOG_SWITCH lands there, repeat construction in this case. --- libs/postgres_ffi/wal_craft/src/lib.rs | 46 ++++++++++++-------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index b6769629a8..6052f04d11 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -373,31 +373,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", &[&(repeats as i32)], )?; - break; - } - info!( - "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", - client.pg_current_wal_insert_lsn()?, - XLOG_SIZE_OF_XLOG_RECORD - ); + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); - // Emit the XLOG_SWITCH - let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); - ensure!( - xlog_switch_record_end < next_segment, - "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}", - xlog_switch_record_end, - next_segment - ); - ensure!( - u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", - xlog_switch_record_end, - u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ - ); - Ok(vec![before_xlog_switch, xlog_switch_record_end]) + // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let xlog_switch_record_end: PgLsn = + client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + + if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + != XLOG_SIZE_OF_XLOG_SHORT_PHD + { + warn!( + "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating", + xlog_switch_record_end, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + ); + continue; + } + return Ok(vec![before_xlog_switch, xlog_switch_record_end]); + } } } From ef96c82c9f189633629e6bfc051a78fbab12c9a6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 21 May 2024 19:33:03 +0300 Subject: [PATCH 109/130] Fix zenith_test_evict mode and clear_buffer_cache() function Using InvalidateBuffer is wrong, because if the page is concurrently dirtied, it will throw away the dirty page without calling smgwrite(). In Neon, that means that the last-written LSN update for the page is missed. In v16, use the new InvalidateVictimBuffer() function that does what we need. In v15 and v14, backport the InvalidateVictimBuffer() function. Fixes issue https://github.com/neondatabase/neon/issues/7802 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 21ec61d539..0d30e28f74 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 21ec61d539d22a81fe811c2d79e26436820bc3f4 +Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e2dbd63345..74fb144890 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e2dbd63345c584de75173c27951f111249ae0016 +Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index c271017c6c..3c2b9d576c 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit c271017c6c4846be59948766baec2ba4ace5dc9c +Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2 diff --git a/vendor/revisions.json b/vendor/revisions.json index a3af9331fe..2f16f334c5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "c271017c6c4846be59948766baec2ba4ace5dc9c"], - "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"], - "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"] + "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"], + "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"], + "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"] } From df9ab1b5e3962da4d98c7b1cd6a7fa20f4ff3902 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 May 2024 15:43:21 +0300 Subject: [PATCH 110/130] refactor(test): duplication with fullbackup, tar content hashing (#7828) "taking a fullbackup" is an ugly multi-liner copypasted in multiple places, most recently with timeline ancestor detach tests. move it under `PgBin` which is not a great place, but better than yet another utility function. Additionally: - cleanup `psql_env` repetition (PgBin already configures that) - move the backup tar comparison as a yet another free utility function - use backup tar comparison in `test_import.py` where a size check was done previously - cleanup extra timeline creation from test Cc: #7715 --- test_runner/fixtures/neon_fixtures.py | 22 ++++ test_runner/fixtures/utils.py | 50 ++++++++ test_runner/regress/test_fullbackup.py | 23 ++-- test_runner/regress/test_import.py | 25 +--- test_runner/regress/test_next_xid.py | 11 +- .../regress/test_timeline_detach_ancestor.py | 120 +++--------------- 6 files changed, 106 insertions(+), 145 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7a660b64ed..b02054a702 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2788,6 +2788,28 @@ class PgBin: log.info(f"last checkpoint at {checkpoint_lsn}") return Lsn(checkpoint_lsn) + def take_fullbackup( + self, + pageserver: NeonPageserver, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, + output: Path, + ): + """ + Request fullbackup from pageserver, store it at 'output'. + """ + cmd = [ + "psql", + "--no-psqlrc", + pageserver.connstr(), + "-c", + f"fullbackup {tenant} {timeline} {lsn}", + "-o", + str(output), + ] + self.run_capture(cmd) + @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 16dc9e8cfb..70263245e7 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -4,10 +4,13 @@ import json import os import re import subprocess +import tarfile import threading import time +from hashlib import sha256 from pathlib import Path from typing import ( + IO, TYPE_CHECKING, Any, Callable, @@ -15,8 +18,10 @@ from typing import ( Iterable, List, Optional, + Set, Tuple, TypeVar, + Union, ) from urllib.parse import urlencode @@ -499,3 +504,48 @@ class AuxFileStore(str, enum.Enum): def __str__(self) -> str: return f"'aux-{self.value}'" + + +def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]): + """ + This is essentially: + + lines=$(comm -3 \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + | wc -l) + [ "$lines" = "0" ] + + But in a more mac friendly fashion. + """ + started_at = time.time() + + def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: + assert reader is not None + digest = sha256(usedforsecurity=False) + while True: + buf = reader.read(64 * 1024) + if not buf: + break + digest.update(buf) + return digest.digest() + + def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: + with tarfile.open(p) as f: + matching_files = (info for info in f if info.isreg() and info.name not in skip_files) + ret = list( + map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) + ) + ret.sort(key=lambda t: t[0]) + return ret + + left_list, right_list = map(build_hash_list, [left, right]) + + try: + assert len(left_list) == len(right_list) + + for left_tuple, right_tuple in zip(left_list, right_list): + assert left_tuple == right_tuple + finally: + elapsed = time.time() - started_at + log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index e1e4f700d4..e6d51a77a6 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,7 +1,7 @@ import os from pathlib import Path -from fixtures.common_types import Lsn, TimelineId +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -19,17 +19,16 @@ def test_fullbackup( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor, - pg_distrib_dir: Path, test_output_dir: Path, ): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_fullbackup") - endpoint_main = env.endpoints.create_start("test_fullbackup") + # endpoint needs to be alive until the fullbackup so that we have + # prev_record_lsn for the vanilla_pg to start in read-write mode + # for some reason this does not happen if endpoint is shutdown. + endpoint_main = env.endpoints.create_start("main") with endpoint_main.cursor() as cur: - timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") cur.execute( @@ -41,17 +40,13 @@ def test_fullbackup( lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) - query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file + ) subprocess_capture( env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)] ) @@ -61,7 +56,7 @@ def test_fullbackup( # use resetwal to overwrite it pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) # Restore from the backup and find the data we inserted port = port_distributor.get_port() diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 1f1c8cc582..62229ebfe7 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -21,7 +21,7 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.utils import subprocess_capture +from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): @@ -248,15 +248,9 @@ def _import( path to the backup archive file""" log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get a fullbackup from pageserver - query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file) # Stop the first pageserver instance, erase all its data env.endpoints.stop_all() @@ -305,22 +299,11 @@ def _import( assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup - query = f"fullbackup { tenant} {timeline} {lsn}" new_tar_output_file = test_output_dir / "fullbackup-new.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - query, - "-o", - str(new_tar_output_file), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file) # Check it's the same as the first fullbackup - # TODO pageserver should be checking checksum - assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) + assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set()) # Check that gc works pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 45c0e3e409..98fb06a0d6 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -5,7 +5,7 @@ from pathlib import Path from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) @@ -71,22 +71,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): def test_import_at_2bil( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_distrib_dir: Path, - pg_bin, + pg_bin: PgBin, vanilla_pg, ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Reset the vanilla Postgres instance to somewhat before 2 billion transactions. pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) vanilla_pg.start() vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 3e435caeee..1563c161e0 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1,25 +1,18 @@ import datetime import enum -import tarfile -import time from concurrent.futures import ThreadPoolExecutor -from hashlib import sha256 -from pathlib import Path from queue import Empty, Queue from threading import Barrier -from typing import IO, List, Set, Tuple, Union +from typing import List, Tuple import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, - PgBin, - wait_for_last_flush_lsn, -) +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo from fixtures.pageserver.utils import wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage +from fixtures.utils import assert_pageserver_backups_equal def by_end_lsn(info: HistoricLayerInfo) -> Lsn: @@ -68,7 +61,6 @@ SHUTDOWN_ALLOWED_ERRORS = [ @pytest.mark.parametrize("write_to_branch_first", [True, False]) def test_ancestor_detach_branched_from( test_output_dir, - pg_distrib_dir, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, branchpoint: Branchpoint, @@ -80,7 +72,6 @@ def test_ancestor_detach_branched_from( """ env = neon_env_builder.init_start() - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) client = env.pageserver.http_client() @@ -160,16 +151,9 @@ def test_ancestor_detach_branched_from( # run fullbackup to make sure there are no off by one errors # take this on the parent fullbackup_before = test_output_dir / "fullbackup-before.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {env.initial_timeline} {branch_at}", - "-o", - str(fullbackup_before), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before + ) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert all_reparented == set() @@ -200,16 +184,9 @@ def test_ancestor_detach_branched_from( # take this on the detached, at same lsn fullbackup_after = test_output_dir / "fullbackup-after.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {timeline_id} {branch_at}", - "-o", - str(fullbackup_after), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after + ) client.timeline_delete(env.initial_tenant, env.initial_timeline) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) @@ -218,52 +195,7 @@ def test_ancestor_detach_branched_from( # as there is always "PREV_LSN: invalid" for "before" skip_files = {"zenith.signal"} - tar_cmp(fullbackup_before, fullbackup_after, skip_files) - - -def tar_cmp(left: Path, right: Path, skip_files: Set[str]): - """ - This is essentially: - - lines=$(comm -3 \ - <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ - <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ - | wc -l) - [ "$lines" = "0" ] - - But in a more mac friendly fashion. - """ - started_at = time.time() - - def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: - assert reader is not None - digest = sha256(usedforsecurity=False) - while True: - buf = reader.read(64 * 1024) - if not buf: - break - digest.update(buf) - return digest.digest() - - def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: - with tarfile.open(p) as f: - matching_files = (info for info in f if info.isreg() and info.name not in skip_files) - ret = list( - map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) - ) - ret.sort(key=lambda t: t[0]) - return ret - - left_list, right_list = map(build_hash_list, [left, right]) - - try: - assert len(left_list) == len(right_list) - - for left_tuple, right_tuple in zip(left_list, right_list): - assert left_tuple == right_tuple - finally: - elapsed = time.time() - started_at - log.info(f"tar_cmp completed in {elapsed}s") + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files) def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): @@ -483,7 +415,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn def test_compaction_induced_by_detaches_in_history( - neon_env_builder: NeonEnvBuilder, test_output_dir, pg_distrib_dir, pg_bin: PgBin + neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin ): """ Assuming the tree of timelines: @@ -500,8 +432,6 @@ def test_compaction_induced_by_detaches_in_history( timeline broken. """ - psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - env = neon_env_builder.init_start( initial_tenant_conf={ # we want to create layers manually so we don't branch on arbitrary @@ -589,16 +519,9 @@ def test_compaction_induced_by_detaches_in_history( # take the fullbackup before and after inheriting the new L0s fullbackup_before = test_output_dir / "fullbackup-before.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", - "-o", - str(fullbackup_before), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before + ) for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) @@ -624,19 +547,12 @@ def test_compaction_induced_by_detaches_in_history( assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted" fullbackup_after = test_output_dir / "fullbackup_after.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}", - "-o", - str(fullbackup_after), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after + ) # we don't need to skip any files, because zenith.signal will be identical - tar_cmp(fullbackup_before, fullbackup_after, set()) + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) # TODO: From d1d55bbd9fbdca34d7f96c391f4a0368fb8c6583 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 22 May 2024 14:43:10 +0100 Subject: [PATCH 111/130] CI(report-benchmarks-failures): fix condition (#7820) ## Problem `report-benchmarks-failures` got skipped if a dependent job fails. ## Summary of changes - Fix the if-condition by adding `&& failures()` to it; it'll make the job run if the dependent job fails. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 14d19f7ae3..d8ad6e26d0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -548,7 +548,7 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] - if: github.ref_name == 'main' && needs.benchmarks.result == 'failure' + if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' runs-on: ubuntu-latest steps: From ce44dfe3532b22689464cbeddbf392d05049b9c4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 May 2024 16:55:34 +0300 Subject: [PATCH 112/130] openapi: document timeline ancestor detach (#7650) The openapi description with the error descriptions: - 200 is used for "detached or has been detached previously" - 400 is used for "cannot be detached right now" -- it's an odd thing, but good enough - 404 is used for tenant or timeline not found - 409 is used for "can never be detached" (root timeline) - 500 is used for transient errors (basically ill-defined shutdown errors) - 503 is used for busy (other tenant ancestor detach underway, pageserver shutdown) Cc: #6994 --- pageserver/src/http/openapi_spec.yml | 87 +++++++++++++++++++ pageserver/src/http/routes.rs | 27 +++--- .../src/tenant/timeline/detach_ancestor.rs | 30 ++++++- .../regress/test_timeline_detach_ancestor.py | 36 +++++++- 4 files changed, 161 insertions(+), 19 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 107bcd4a22..e5eafc51f4 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -612,6 +612,80 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + ŕequired: true + schema: + type: string + + put: + description: | + Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`. + Current implementation might not be retryable across failure cases, but will be enhanced in future. + Detaching should be expected to be expensive operation. Timeouts should be retried. + responses: + "200": + description: | + The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented. + If any timelines were deleted after reparenting, they might not be on this list. + content: + application/json: + schema: + $ref: "#/components/schemas/AncestorDetached" + + "400": + description: | + Number of early checks meaning the timeline cannot be detached now: + - the ancestor of timeline has an ancestor: not supported, see RFC + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "404": + description: Tenant or timeline not found. + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + + "409": + description: | + The timeline can never be detached: + - timeline has no ancestor, implying that the timeline has never had an ancestor + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + + "500": + description: | + Transient error, for example, pageserver shutdown happened while + processing the request but we were unable to distinguish that. Must + be retried. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "503": + description: | + Temporarily unavailable, please retry. Possible reasons: + - another timeline detach for the same tenant is underway, please retry later + - detected shutdown error + content: + application/json: + schema: + $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/tenant/: get: description: Get tenants list @@ -1077,6 +1151,19 @@ components: format: int64 description: How many bytes of layer content were in the latest layer heatmap + AncestorDetached: + type: object + required: + - reparented_timelines + properties: + reparented_timelines: + type: array + description: Set of reparented timeline ids + properties: + type: string + format: hex + description: TimelineId + Error: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c75e4ca5a9..34b9806a26 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -74,6 +74,7 @@ use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; +use crate::tenant::GetTimelineError; use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; @@ -279,6 +280,13 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(gte: GetTimelineError) -> Self { + // Rationale: tenant is activated only after eligble timelines activate + ApiError::NotFound(gte.into()) + } +} + impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { @@ -643,9 +651,7 @@ async fn timeline_preserve_initdb_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; timeline .preserve_initdb_archive() @@ -687,9 +693,7 @@ async fn timeline_detail_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, false)?; let timeline_info = build_timeline_info( &timeline, @@ -1901,14 +1905,11 @@ async fn timeline_detach_ancestor_handler( let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into()))?; + let timeline = tenant.get_timeline(timeline_id, true)?; let (_guard, prepared) = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .await?; let res = state .tenant_manager @@ -2042,9 +2043,7 @@ async fn active_timeline_of_active_tenant( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into())) + Ok(tenant.get_timeline(timeline_id, true)?) } async fn always_panic_handler( diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4d8e570181..e6ddabe5b5 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -12,7 +12,7 @@ use crate::{ }; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -41,6 +41,27 @@ pub(crate) enum Error { Unexpected(#[source] anyhow::Error), } +impl From for ApiError { + fn from(value: Error) -> Self { + match value { + e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), + // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError? + e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::ShuttingDown => ApiError::ShuttingDown, + Error::OtherTimelineDetachOngoing(_) => { + ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + } + // All of these contain shutdown errors, in fact, it's the most common + e @ Error::FlushAncestor(_) + | e @ Error::RewrittenDeltaDownloadFailed(_) + | e @ Error::CopyDeltaPrefix(_) + | e @ Error::UploadRewritten(_) + | e @ Error::CopyFailed(_) + | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + } + } +} + pub(crate) struct PreparedTimelineDetach { layers: Vec, } @@ -75,6 +96,11 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { + // TODO: check if we have already been detached; for this we need to read the stored data + // on remote client, for that we need a follow-up which makes uploads cheaper and maintains + // a projection of the commited data. + // + // the error is wrong per openapi return Err(NoAncestor); }; @@ -84,7 +110,7 @@ pub(super) async fn prepare( if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose - // not to + // not to, at least initially return Err(TooManyAncestors); } diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 1563c161e0..f0b2f7d733 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -8,9 +8,13 @@ from typing import List, Tuple import pytest from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.http import HistoricLayerInfo -from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage from fixtures.utils import assert_pageserver_backups_equal @@ -555,6 +559,32 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) +def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert info.value.status_code == 409 + + first_branch = env.neon_cli.create_branch("first_branch") + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + client.tenant_delete(env.initial_tenant) + wait_tenant_status_404(client, env.initial_tenant, 10, 1) + + with pytest.raises(PageserverApiException) as e: + client.detach_ancestor(env.initial_tenant, first_branch) + assert e.value.status_code == 404 + + # TODO: # - after starting the operation, tenant is deleted # - after starting the operation, pageserver is shutdown, restarted From 8901ce9c99f068464e9ee15673f25569ff298f17 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 21 May 2024 18:26:47 -0500 Subject: [PATCH 113/130] Fix typos in action definitions --- .github/actions/neon-branch-create/action.yml | 6 +++--- .github/actions/neon-branch-delete/action.yml | 8 ++++---- .github/actions/neon-project-create/action.yml | 12 ++++++------ .github/actions/neon-project-delete/action.yml | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index dea3fc2357..9f752d5a89 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -3,13 +3,13 @@ description: 'Create Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project to create Branch in' + description: 'ID of the Project to create Branch in' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build outputs: dsn: diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index 8acba7ad00..58141a4a3f 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -3,16 +3,16 @@ description: 'Delete Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project which should be deleted' + description: 'ID of the Project which should be deleted' required: true branch_id: - desctiption: 'ID of the branch to delete' + description: 'ID of the branch to delete' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build runs: diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 7f0e599b97..4039a58b9e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -3,22 +3,22 @@ description: 'Create Neon Project using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true region_id: - desctiption: 'Region ID, if not set the project will be created in the default region' + description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - desctiption: 'Postgres version; default is 15' + description: 'Postgres version; default is 15' default: 15 api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build provisioner: - desctiption: 'k8s-pod or k8s-neonvm' + description: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' compute_units: - desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' default: '[1, 1]' outputs: diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index b8ec6cac70..35e165fd61 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -3,13 +3,13 @@ description: 'Delete Neon Project using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project to delete' + description: 'ID of the Project to delete' required: true api_host: - desctiption: 'Neon API host' + description: 'Neon API host' default: console-stage.neon.build runs: From 900f39111507bc058bd541b66deddd66d6473443 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 21 May 2024 18:27:30 -0500 Subject: [PATCH 114/130] Make postgres_version action input default to a string This is "required" by GitHub Actions, though they must do some coersion on their side. --- .github/actions/neon-project-create/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 4039a58b9e..16759ad038 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -10,7 +10,7 @@ inputs: default: aws-us-east-2 postgres_version: description: 'Postgres version; default is 15' - default: 15 + default: '15' api_host: description: 'Neon API host' default: console-stage.neon.build From 325f3784f9fe46b5152000ba50ae4a01c9fd68a4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 22 May 2024 16:02:20 +0100 Subject: [PATCH 115/130] CI(promote-images): simplify & fix the job (#7826) ## Problem Currently, `latest` tag is added to the images in several cases: ``` github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' ``` This leads to a race; the `latest` tag jumps back and forth depending on the branch that has built images. ## Summary of changes - Do not push `latest` images to prod ECR (we don't use it) - Use `docker buildx imagetools` instead of `crane` for tagging images - Unify `vm-compute-node-image` job with others and use dockerhub as a first source for images (sync images with ECR) - Tag images with `latest` only for commits in `main` --- .github/workflows/build_and_test.yml | 117 ++++++++++++--------------- 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d8ad6e26d0..5056025457 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -883,22 +883,39 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -spec=vm-image-spec.yaml \ - -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ - -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] @@ -946,78 +963,48 @@ jobs: promote-images: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - # Don't add if-condition here. - # The job should always be run because we have dependant other jobs that shouldn't be skipped + runs-on: ubuntu-latest + + env: + VERSIONS: v14 v15 v16 steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Copy vm-compute-node images to Docker Hub + - name: Copy vm-compute-node images to ECR run: | - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 + for version in ${VERSIONS}; do + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done - name: Add latest tag to images - if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' + if: github.ref_name == 'main' run: | - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest + for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do + docker buildx imagetools create -t $repo/neon:latest \ + $repo/neon:${{ needs.tag.outputs.build-tag }} - - name: Push images to production ECR - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - run: | - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest + docker buildx imagetools create -t $repo/compute-tools:latest \ + $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - - name: Configure Docker Hub login - run: | - # ECR Credential Helper & Docker Hub don't work together in config, hence reset - echo "" > /github/home/.docker/config.json - crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io + for version in ${VERSIONS}; do + docker buildx imagetools create -t $repo/compute-node-${version}:latest \ + $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} - - name: Push vm-compute-node to Docker Hub - run: | - crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - - - name: Push latest tags to Docker Hub - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - run: | - crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ + $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done + done trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] From a7f31f1a59aec1cdd66df71ffaf3b7ad176c4966 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 22 May 2024 16:06:05 +0100 Subject: [PATCH 116/130] CI: build multi-arch images (#7696) ## Problem We don't build our docker images for ARM arch, and that makes it harder to run images on ARM (on MacBooks with Apple Silicon, for example). ## Summary of changes - Build `neondatabase/neon` for ARM and create a multi-arch image - Build `neondatabase/compute-node-vXX` for ARM and create a multi-arch image - Run `test-images` job on ARM as well --- .github/workflows/build_and_test.yml | 123 +++++++++++++++++----- docker-compose/compute_wrapper/Dockerfile | 2 +- docker-compose/docker_compose_test.sh | 2 - 3 files changed, 97 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5056025457..2ab1417d6d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -723,9 +723,13 @@ jobs: uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit - neon-image: + neon-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] + strategy: + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -747,12 +751,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - uses: docker/build-push-action@v5 with: context: . @@ -764,25 +762,52 @@ jobs: push: true pull: true file: Dockerfile - cache-from: type=registry,ref=neondatabase/neon:cache - cache-to: type=registry,ref=neondatabase/neon:cache,mode=max + cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} - neondatabase/neon:${{needs.tag.outputs.build-tag}} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory if: always() run: | rm -rf .docker-custom - compute-node-image: - needs: [ check-permissions, build-build-tools-image, tag ] - runs-on: [ self-hosted, gen3, large ] + neon-image: + needs: [ neon-image-arch, tag ] + runs-on: ubuntu-latest + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch image + run: | + docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }} + + compute-node-image-arch: + needs: [ check-permissions, build-build-tools-image, tag ] strategy: fail-fast: false matrix: version: [ v14, v15, v16 ] + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -829,15 +854,14 @@ jobs: push: true pull: true file: Dockerfile.compute-node - cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache - cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max + cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: ${{ matrix.version == 'v16' }} + if: matrix.version == 'v16' uses: docker/build-push-action@v5 with: target: compute-tools-image @@ -851,14 +875,57 @@ jobs: pull: true file: Dockerfile.compute-node tags: | - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory if: always() run: | rm -rf .docker-custom + compute-node-image: + needs: [ compute-node-image-arch, tag ] + runs-on: ubuntu-latest + + strategy: + matrix: + version: [ v14, v15, v16 ] + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + + - name: Create multi-arch compute-tools image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Push multi-arch compute-tools image to ECR + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} + vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, gen3, large ] @@ -866,9 +933,6 @@ jobs: fail-fast: false matrix: version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} env: VM_BUILDER_VERSION: v0.28.1 @@ -919,7 +983,12 @@ jobs: test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] - runs-on: [ self-hosted, gen3, small ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout @@ -937,7 +1006,7 @@ jobs: - name: Verify image versions shell: bash # ensure no set -e for better error messages run: | - pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index f1b1986072..974dcd7f03 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -1,4 +1,4 @@ -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG REPOSITORY=neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e18b0f9176..062fc6fc92 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -8,8 +8,6 @@ # Their defaults point at DockerHub `neondatabase/neon:latest` image.`, # to verify custom image builds (e.g pre-published ones). -# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer. - set -eux -o pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" From e015b2bf3eb96c7a5d8b9bf0253ae13e569f5747 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 22 May 2024 16:10:58 +0100 Subject: [PATCH 117/130] safekeeper: use CancellationToken instead of watch channel (#7836) ## Problem Safekeeper Timeline uses a channel for cancellation, but we have a dedicated type for that. ## Summary of changes - Use CancellationToken in Timeline --- safekeeper/src/recovery.rs | 10 ++------- safekeeper/src/timeline.rs | 31 +++++++--------------------- safekeeper/src/timeline_manager.rs | 10 +-------- safekeeper/src/wal_backup_partial.rs | 14 +++---------- 4 files changed, 13 insertions(+), 52 deletions(-) diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index e8fa6c55f4..dfa1892c40 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -37,17 +37,11 @@ use crate::{ #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { info!("started"); - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; + let cancel = tli.cancel.clone(); select! { _ = recovery_main_loop(tli, conf) => { unreachable!() } - _ = cancellation_rx.changed() => { + _ = cancel.cancelled() => { info!("stopped"); } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index da2e3f4538..89c157d514 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -6,6 +6,7 @@ use camino::Utf8PathBuf; use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::fs; +use tokio_util::sync::CancellationToken; use std::cmp::max; use std::ops::{Deref, DerefMut}; @@ -342,12 +343,8 @@ pub struct Timeline { walsenders: Arc, walreceivers: Arc, - /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. - cancellation_tx: watch::Sender, - - /// Timeline should not be used after cancellation. Background tasks should - /// monitor this channel and stop eventually after receiving `true` from this channel. - cancellation_rx: watch::Receiver, + /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires + pub(crate) cancel: CancellationToken, /// Directory where timeline state is stored. pub timeline_dir: Utf8PathBuf, @@ -376,7 +373,6 @@ impl Timeline { shared_state.sk.flush_lsn(), ))); let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); - let (cancellation_tx, cancellation_rx) = watch::channel(false); let walreceivers = WalReceivers::new(); Ok(Timeline { @@ -390,8 +386,7 @@ impl Timeline { mutex: RwLock::new(shared_state), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, - cancellation_rx, - cancellation_tx, + cancel: CancellationToken::default(), timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, broker_active: AtomicBool::new(false), @@ -411,7 +406,6 @@ impl Timeline { let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); - let (cancellation_tx, cancellation_rx) = watch::channel(false); let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); @@ -428,8 +422,7 @@ impl Timeline { mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, - cancellation_rx, - cancellation_tx, + cancel: CancellationToken::default(), timeline_dir: conf.timeline_dir(&ttid), walsenders_keep_horizon: conf.walsenders_keep_horizon, broker_active: AtomicBool::new(false), @@ -535,7 +528,7 @@ impl Timeline { /// eventually after receiving cancellation signal. fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { info!("timeline {} is cancelled", self.ttid); - let _ = self.cancellation_tx.send(true); + self.cancel.cancel(); // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.wal_store.close(); @@ -543,17 +536,7 @@ impl Timeline { /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { - *self.cancellation_rx.borrow() - } - - /// Returns watch channel which gets value when timeline is cancelled. It is - /// guaranteed to have not cancelled value observed (errors otherwise). - pub fn get_cancellation_rx(&self) -> Result> { - let rx = self.cancellation_rx.clone(); - if *rx.borrow() { - bail!(TimelineError::Cancelled(self.ttid)); - } - Ok(rx) + self.cancel.is_cancelled() } /// Take a writing mutual exclusive lock on timeline shared_state. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 52ad915065..e74ba37ad8 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -47,14 +47,6 @@ pub async fn main_task( conf: SafeKeeperConf, broker_active_set: Arc, ) { - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; - scopeguard::defer! { if tli.is_cancelled() { info!("manager task finished"); @@ -129,7 +121,7 @@ pub async fn main_task( // wait until something changes. tx channels are stored under Arc, so they will not be // dropped until the manager task is finished. tokio::select! { - _ = cancellation_rx.changed() => { + _ = tli.cancel.cancelled() => { // timeline was deleted break 'outer state_snapshot; } diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 200096ac5c..29e944bff3 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -277,14 +277,6 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { debug!("started"); let await_duration = conf.partial_backup_timeout; - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; - // sleep for random time to avoid thundering herd { let randf64 = rand::thread_rng().gen_range(0.0..1.0); @@ -327,7 +319,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { && flush_lsn_rx.borrow().term == seg.term { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } @@ -340,7 +332,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { // if we don't have any data and zero LSNs, wait for something while flush_lsn_rx.borrow().lsn == Lsn(0) { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } @@ -357,7 +349,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { // waiting until timeout expires OR segno changes 'inner: loop { tokio::select! { - _ = cancellation_rx.changed() => { + _ = backup.tli.cancel.cancelled() => { info!("timeline canceled"); return; } From 62aac6c8add432da1ab2d206f19323e808622e8d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 May 2024 18:13:45 +0300 Subject: [PATCH 118/130] fix(Layer): carry gate until eviction is complete (#7838) the gate was accidentially being dropped before the final blocking phase, possibly explaining the resident physical size global problems during deletions. it could had caused more harm as well, but the path is not actively being tested because cplane no longer puts locationconfigs with higher generation number during normal operation which prompted the last wave of fixes. Cc: #7341. --- pageserver/src/tenant/storage_layer/layer.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 45d61ce048..8c64621710 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime}; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; -use utils::sync::heavier_once_cell; +use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -1333,7 +1333,7 @@ impl LayerInner { is_good_to_continue(&rx.borrow_and_update())?; - let Ok(_gate) = timeline.gate.enter() else { + let Ok(gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; @@ -1421,7 +1421,7 @@ impl LayerInner { Self::spawn_blocking(move || { let _span = span.entered(); - let res = self.evict_blocking(&timeline, &permit); + let res = self.evict_blocking(&timeline, &gate, &permit); let waiters = self.inner.initializer_count(); @@ -1447,6 +1447,7 @@ impl LayerInner { fn evict_blocking( &self, timeline: &Timeline, + _gate: &gate::GateGuard, _permit: &heavier_once_cell::InitPermit, ) -> Result<(), EvictionCancelled> { // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` From 3404e76a51311a595e28beb7d0962488cafaf143 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 22 May 2024 14:36:13 +0300 Subject: [PATCH 119/130] Fix confusion between 1-based Buffer and 0-based index (#7825) The code was working correctly, but was incorrectly using Buffer for a 0-based index into the BufferDesc array. --- pgxn/neon/pagestore_smgr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 249ad313b0..4361c74905 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3216,7 +3216,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) BufferTag tag; uint32 hash; LWLock *partitionLock; - Buffer buffer; + int buf_id; bool no_redo_needed; if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) @@ -3254,9 +3254,9 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) else { /* Try to find the relevant buffer */ - buffer = BufTableLookup(&tag, hash); + buf_id = BufTableLookup(&tag, hash); - no_redo_needed = buffer < 0; + no_redo_needed = buf_id < 0; } /* In both cases st lwlsn past this WAL record */ SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); From 921756402673e9e2fa764fbacf33457f05bc3d59 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 22 May 2024 14:57:09 +0300 Subject: [PATCH 120/130] Fix issues with determining request LSN in read replica (#7795) Don't set last-written LSN of a page when the record is replayed, only when the page is evicted from cache. For comparison, we don't update the last-written LSN on every page modification on the primary either, only when the page is evicted. Do update the last-written LSN when the page update is skipped in WAL redo, however. In neon_get_request_lsns(), don't be surprised if the last-written LSN is equal to the record being replayed. Use the LSN of the record being replayed as the request LSN in that case. Add a long comment explaining how that can happen. In neon_wallog_page, update last-written LSN also when Shutdown has been requested. We might still fetch and evict pages for a while, after shutdown has been requested, so we better continue to do that correctly. Enable the check that we don't evict a page with zero LSN also in standby, but make it a LOG message instead of PANIC Fixes issue https://github.com/neondatabase/neon/issues/7791 --- pgxn/neon/pagestore_smgr.c | 142 +++++++++++++++++++++--- test_runner/regress/test_hot_standby.py | 60 ++++++++++ 2 files changed, 185 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 4361c74905..41546eae85 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1349,6 +1349,10 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +/* + * A page is being evicted from the shared buffer cache. Update the + * last-written LSN of the page, and WAL-log it if needed. + */ static void #if PG_MAJORVERSION_NUM < 16 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) @@ -1357,12 +1361,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co #endif { XLogRecPtr lsn = PageGetLSN((Page) buffer); - - if (ShutdownRequestPending) - return; - /* Don't log any pages if we're not allowed to do so. */ - if (!XLogInsertAllowed()) - return; + bool log_page; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1371,9 +1370,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. */ - if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress()) + log_page = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_page = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_page = true; + } + + if (log_page) { - /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum, @@ -1386,7 +1397,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } - else if (lsn == InvalidXLogRecPtr) + + if (lsn == InvalidXLogRecPtr) { /* * When PostgreSQL extends a relation, it calls smgrextend() with an @@ -1422,19 +1434,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } - else + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) { - ereport(PANIC, + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? LOG : PANIC, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ } } else { ereport(SmgrTrace, - (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); @@ -1527,8 +1551,92 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) if (RecoveryInProgress()) { - /* Request the page at the last replayed LSN. */ - result.request_lsn = GetXLogReplayRecPtr(NULL); + /*--- + * In broad strokes, a replica always requests the page at the current + * replay LSN. But looking closer, what exactly is the replay LSN? Is + * it the last replayed record, or the record being replayed? And does + * the startup process performing the replay need to do something + * differently than backends running queries? Let's take a closer look + * at the different scenarios: + * + * 1. Startup process reads a page, last_written_lsn is old. + * + * Read the old version of the page. We will apply the WAL record on + * it to bring it up-to-date. + * + * We could read the new version, with the changes from this WAL + * record already applied, to offload the work of replaying the record + * to the pageserver. The pageserver might not have received the WAL + * record yet, though, so a read of the old page version and applying + * the record ourselves is likely faster. Also, the redo function + * might be surprised if the changes have already applied. That's + * normal during crash recovery, but not in hot standby. + * + * 2. Startup process reads a page, last_written_lsn == record we're + * replaying. + * + * Can this happen? There are a few theoretical cases when it might: + * + * A) The redo function reads the same page twice. We had already read + * and applied the changes once, and now we're reading it for the + * second time. That would be a rather silly thing for a redo + * function to do, and I'm not aware of any that would do it. + * + * B) The redo function modifies multiple pages, and it already + * applied the changes to one of the pages, released the lock on + * it, and is now reading a second page. Furthermore, the first + * page was already evicted from the buffer cache, and also from + * the last-written LSN cache, so that the per-relation or global + * last-written LSN was already updated. All the WAL redo functions + * hold the locks on pages that they modify, until all the changes + * have been modified (?), which would make that impossible. + * However, we skip the locking, if the page isn't currently in the + * page cache (see neon_redo_read_buffer_filter below). + * + * Even if the one of the above cases were possible in theory, they + * would also require the pages being modified by the redo function to + * be immediately evicted from the page cache. + * + * So this probably does not happen in practice. But if it does, we + * request the new version, including the changes from the record + * being replayed. That seems like the correct behavior in any case. + * + * 3. Backend process reads a page with old last-written LSN + * + * Nothing special here. Read the old version. + * + * 4. Backend process reads a page with last_written_lsn == record being replayed + * + * This can happen, if the redo function has started to run, and saw + * that the page isn't present in the page cache (see + * neon_redo_read_buffer_filter below). Normally, in a normal + * Postgres server, the redo function would hold a lock on the page, + * so we would get blocked waiting the redo function to release the + * lock. To emulate that, wait for the WAL replay of the record to + * finish. + */ + /* Request the page at the end of the last fully replayed LSN. */ + XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); + + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ +#if PG_VERSION_NUM >= 150000 + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); +#endif + + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result.request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result.request_lsn = replay_lsn; + } result.not_modified_since = last_written_lsn; result.effective_request_lsn = result.request_lsn; Assert(last_written_lsn <= result.request_lsn); @@ -3258,16 +3366,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) no_redo_needed = buf_id < 0; } - /* In both cases st lwlsn past this WAL record */ - SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); /* * we don't have the buffer in memory, update lwLsn past this record, also * evict page from file cache */ if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); lfc_evict(rinfo, forknum, blkno); - + } LWLockRelease(partitionLock); diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 244d482c18..cf7a1c56ee 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,3 +1,4 @@ +import asyncio import os import re import threading @@ -292,3 +293,62 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): assert slot_xmin is None wait_until(10, 1.0, xmin_is_null) + + +# Test race condition between WAL replay and backends performing queries +# https://github.com/neondatabase/neon/issues/7791 +def test_replica_query_race(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary_ep = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + + with primary_ep.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter") + + standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby") + time.sleep(1) + + # In primary, run a lot of UPDATEs on a single page + finished = False + writecounter = 1 + + async def primary_workload(): + nonlocal writecounter, finished + conn = await primary_ep.connect_async() + while writecounter < 10000: + writecounter += 1 + await conn.execute(f"UPDATE test SET counter = {writecounter}") + finished = True + + # In standby, at the same time, run queries on it. And repeatedly drop caches + async def standby_workload(): + nonlocal writecounter, finished + conn = await standby_ep.connect_async() + reads = 0 + while not finished: + readcounter = await conn.fetchval("SELECT counter FROM test") + + # Check that the replica is keeping up with the primary. In local + # testing, the lag between primary and standby is much smaller, in + # the ballpark of 2-3 counter values. But be generous in case there's + # some hiccup. + # assert(writecounter - readcounter < 1000) + assert readcounter <= writecounter + if reads % 100 == 0: + log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") + reads += 1 + + await conn.execute("SELECT clear_buffer_cache()") + + async def both(): + await asyncio.gather( + primary_workload(), + standby_workload(), + ) + + asyncio.run(both()) From 37f81289c2d1b27f88316980fb6b307e7e46d409 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 22 May 2024 18:24:52 +0300 Subject: [PATCH 121/130] Make 'neon.protocol_version = 2' the default, take two (#7819) Once all the computes in production have restarted, we can remove protocol version 1 altogether. See issue #6211. This was done earlier already in commit 0115fe6cb2, but reverted before it was released to production in commit bbe730d7ca because of issue https://github.com/neondatabase/neon/issues/7692. That issue was fixed in commit 22afaea6e1, so we are ready to change the default again. --- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index b7b1e7ccbf..f5ce2caff3 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,7 +49,7 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int neon_protocol_version = 1; +int neon_protocol_version = 2; static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; @@ -860,7 +860,7 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 1, /* default to old protocol for now */ + 2, /* use protocol version 2 */ 1, /* min */ 2, /* max */ PGC_SU_BACKEND, From 64577cfddcdac89d649e1ba6db3a6f44e14e2eee Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 12:41:13 -0400 Subject: [PATCH 122/130] feat(pageserver): auto-detect previous aux file policy (#7841) ## Problem If an existing user already has some aux v1 files, we don't want to switch them to the global tenant-level config. Part of #7462 --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++ pageserver/src/tenant.rs | 67 ++++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f9d8c1020d..7dea687c46 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1480,6 +1480,29 @@ impl<'a> DatadirModification<'a> { // Allowed switch path: // * no aux files -> v1/v2/cross-validation // * cross-validation->v2 + + let current_policy = if current_policy.is_none() { + // This path will only be hit once per tenant: we will decide the final policy in this code block. + // The next call to `put_file` will always have `last_aux_file_policy != None`. + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; + if aux_files_key_v1.is_empty() { + None + } else { + self.tline + .last_aux_file_policy + .store(Some(AuxFilePolicy::V1)); + self.tline + .remote_client + .schedule_index_upload_for_aux_file_policy_update(Some( + AuxFilePolicy::V1, + ))?; + Some(AuxFilePolicy::V1) + } + } else { + current_policy + }; + if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { self.tline.last_aux_file_policy.store(Some(switch_policy)); self.tline @@ -1775,6 +1798,12 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } + /// Only used during unit tests, force putting a key into the modification. + #[cfg(test)] + pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { let values = self.pending_updates.entry(key).or_default(); // Replace the previous value if it exists at the same lsn diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1a66f2c919..caf26e0a0b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3964,18 +3964,20 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; + use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; - use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + use utils::bin_ser::BeSer; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -5997,6 +5999,69 @@ mod tests { ); } + #[tokio::test] + async fn aux_file_policy_auto_detect() { + let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: vec![( + "test_file".to_string(), + Bytes::copy_from_slice(b"test_file"), + )] + .into_iter() + .collect(), + }) + .unwrap(); + modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + modification.commit(&ctx).await.unwrap(); + } + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep using v1 because there are aux files writting with v1" + ); + + // we can still read the auxfile v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("test_file"), + Some(&bytes::Bytes::from_static(b"test_file")) + ); + } + #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation")?; From 9cfe08e3d9f1181a163322705ff41cbcfb11db3b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 22 May 2024 18:05:43 +0100 Subject: [PATCH 123/130] proxy password threadpool (#7806) ## Problem Despite making password hashing async, it can still take time away from the network code. ## Summary of changes Introduce a custom threadpool, inspired by rayon. Features: ### Fairness Each task is tagged with it's endpoint ID. The more times we have seen the endpoint, the more likely we are to skip the task if it comes up in the queue. This is using a min-count-sketch estimator for the number of times we have seen the endpoint, resetting it every 1000+ steps. Since tasks are immediately rescheduled if they do not complete, the worker could get stuck in a "always work available loop". To combat this, we check the global queue every 61 steps to ensure all tasks quickly get a worker assigned to them. ### Balanced Using crossbeam_deque, like rayon does, we have workstealing out of the box. I've tested it a fair amount and it seems to balance the workload accordingly --- Cargo.lock | 20 +- Cargo.toml | 2 + proxy/Cargo.toml | 4 +- proxy/src/auth/backend.rs | 10 +- proxy/src/auth/backend/hacks.rs | 11 +- proxy/src/auth/flow.rs | 24 ++- proxy/src/bin/proxy.rs | 8 + proxy/src/config.rs | 2 + proxy/src/metrics.rs | 89 +++++++-- proxy/src/scram.rs | 18 +- proxy/src/scram/countmin.rs | 173 +++++++++++++++++ proxy/src/scram/exchange.rs | 49 ++--- proxy/src/scram/pbkdf2.rs | 89 +++++++++ proxy/src/scram/threadpool.rs | 321 ++++++++++++++++++++++++++++++++ proxy/src/serverless/backend.rs | 11 +- workspace_hack/Cargo.toml | 2 + 16 files changed, 759 insertions(+), 74 deletions(-) create mode 100644 proxy/src/scram/countmin.rs create mode 100644 proxy/src/scram/pbkdf2.rs create mode 100644 proxy/src/scram/threadpool.rs diff --git a/Cargo.lock b/Cargo.lock index e6060c82f5..d8f9021eb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1471,26 +1471,21 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", - "scopeguard", ] [[package]] @@ -3961,9 +3956,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "pbkdf2" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" dependencies = [ "digest", "hmac", @@ -4386,6 +4381,7 @@ dependencies = [ name = "proxy" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "async-compression", "async-trait", @@ -4402,6 +4398,7 @@ dependencies = [ "chrono", "clap", "consumption_metrics", + "crossbeam-deque", "dashmap", "env_logger", "fallible-iterator", @@ -7473,6 +7470,7 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "aws-config", "aws-runtime", diff --git a/Cargo.toml b/Cargo.toml index 2a7dea447e..0887c039f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] +ahash = "0.8" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } @@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] } comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" +crossbeam-deque = "0.8.5" crossbeam-utils = "0.8.5" dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5f9b0aa75b..7da0763bc1 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -9,6 +9,7 @@ default = [] testing = [] [dependencies] +ahash.workspace = true anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true @@ -24,6 +25,7 @@ camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true +crossbeam-deque.workspace = true dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true @@ -52,7 +54,6 @@ opentelemetry.workspace = true parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true -pbkdf2 = { workspace = true, features = ["simple", "std"] } pin-project-lite.workspace = true postgres_backend.workspace = true pq_proto.workspace = true @@ -106,6 +107,7 @@ workspace_hack.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true tokio-tungstenite.workspace = true +pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 6a906b299b..3555eba543 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -365,7 +365,10 @@ async fn authenticate_with_secret( config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret).await?; + let ep = EndpointIdInt::from(&info.endpoint); + + let auth_outcome = + validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { @@ -386,7 +389,7 @@ async fn authenticate_with_secret( // Currently, we use it for websocket connections (latency). if allow_cleartext { ctx.set_auth_method(crate::context::AuthMethod::Cleartext); - return hacks::authenticate_cleartext(ctx, info, client, secret).await; + return hacks::authenticate_cleartext(ctx, info, client, secret, config).await; } // Finally, proceed with the main auth flow (SCRAM-based). @@ -554,7 +557,7 @@ mod tests { context::RequestMonitoring, proxy::NeonOptions, rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::ServerSecret, + scram::{threadpool::ThreadPool, ServerSecret}, stream::{PqStream, Stream}, }; @@ -596,6 +599,7 @@ mod tests { } static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + thread_pool: ThreadPool::new(1), scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index f7241be4a9..6b0f5e1726 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -3,8 +3,10 @@ use super::{ }; use crate::{ auth::{self, AuthFlow}, + config::AuthenticationConfig, console::AuthSecret, context::RequestMonitoring, + intern::EndpointIdInt, sasl, stream::{self, Stream}, }; @@ -20,6 +22,7 @@ pub async fn authenticate_cleartext( info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, + config: &'static AuthenticationConfig, ) -> auth::Result { warn!("cleartext auth flow override is enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -27,8 +30,14 @@ pub async fn authenticate_cleartext( // pause the timer while we communicate with the client let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let ep = EndpointIdInt::from(&info.endpoint); + let auth_flow = AuthFlow::new(client) - .begin(auth::CleartextPassword(secret)) + .begin(auth::CleartextPassword { + secret, + endpoint: ep, + pool: config.thread_pool.clone(), + }) .await?; drop(paused); // cleartext auth is only allowed to the ws/http protocol. diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 45bbad8cb2..59d1ac17f4 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -5,12 +5,14 @@ use crate::{ config::TlsServerEndPoint, console::AuthSecret, context::RequestMonitoring, - sasl, scram, + intern::EndpointIdInt, + sasl, + scram::{self, threadpool::ThreadPool}, stream::{PqStream, Stream}, }; use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::io; +use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; @@ -53,7 +55,11 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword(pub AuthSecret); +pub struct CleartextPassword { + pub pool: Arc, + pub endpoint: EndpointIdInt, + pub secret: AuthSecret, +} impl AuthMethod for CleartextPassword { #[inline(always)] @@ -126,7 +132,13 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0).await?; + let outcome = validate_password_and_exchange( + &self.state.pool, + self.state.endpoint, + password, + self.state.secret, + ) + .await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -181,6 +193,8 @@ impl AuthFlow<'_, S, Scram<'_>> { } pub(crate) async fn validate_password_and_exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, password: &[u8], secret: AuthSecret, ) -> super::Result> { @@ -194,7 +208,7 @@ pub(crate) async fn validate_password_and_exchange( } // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { - let outcome = crate::scram::exchange(&scram_secret, password).await?; + let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index be7d961b8c..30f2e6f4b7 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; use proxy::redis::notifications; +use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -132,6 +133,9 @@ struct ProxyCliArgs { /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, @@ -489,6 +493,9 @@ async fn main() -> anyhow::Result<()> { /// ProxyConfig is created at proxy startup, and lives forever. fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + let tls_config = match (&args.tls_key, &args.tls_cert) { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, @@ -624,6 +631,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { + thread_pool, scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index b7ab2c00f9..5a0c251ce2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,6 +2,7 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, console::locks::ApiLocks, rate_limiter::RateBucketInfo, + scram::threadpool::ThreadPool, serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, Host, }; @@ -61,6 +62,7 @@ pub struct HttpConfig { } pub struct AuthenticationConfig { + pub thread_pool: Arc, pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 1590316925..e2a75a8720 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,11 +1,11 @@ -use std::sync::OnceLock; +use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::StaticLabelSet, + label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, - MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -14,26 +14,36 @@ use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; #[derive(MetricGroup)] +#[metric(new(thread_pool: Arc))] pub struct Metrics { #[metric(namespace = "proxy")] + #[metric(init = ProxyMetrics::new(thread_pool))] pub proxy: ProxyMetrics, #[metric(namespace = "wake_compute_lock")] pub wake_compute_lock: ApiLockMetrics, } +static SELF: OnceLock = OnceLock::new(); impl Metrics { + pub fn install(thread_pool: Arc) { + SELF.set(Metrics::new(thread_pool)) + .ok() + .expect("proxy metrics must not be installed more than once"); + } + pub fn get() -> &'static Self { - static SELF: OnceLock = OnceLock::new(); - SELF.get_or_init(|| Metrics { - proxy: ProxyMetrics::default(), - wake_compute_lock: ApiLockMetrics::new(), - }) + #[cfg(test)] + return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0)))); + + #[cfg(not(test))] + SELF.get() + .expect("proxy metrics must be installed by the main() function") } } #[derive(MetricGroup)] -#[metric(new())] +#[metric(new(thread_pool: Arc))] pub struct ProxyMetrics { #[metric(flatten)] pub db_connections: CounterPairVec, @@ -129,6 +139,10 @@ pub struct ProxyMetrics { #[metric(namespace = "connect_compute_lock")] pub connect_compute_lock: ApiLockMetrics, + + #[metric(namespace = "scram_pool")] + #[metric(init = thread_pool)] + pub scram_pool: Arc, } #[derive(MetricGroup)] @@ -146,12 +160,6 @@ pub struct ApiLockMetrics { pub semaphore_acquire_seconds: Histogram<16>, } -impl Default for ProxyMetrics { - fn default() -> Self { - Self::new() - } -} - impl Default for ApiLockMetrics { fn default() -> Self { Self::new() @@ -553,3 +561,52 @@ pub enum RedisEventsCount { PasswordUpdate, AllowedIpsUpdate, } + +pub struct ThreadPoolWorkers(usize); +pub struct ThreadPoolWorkerId(pub usize); + +impl LabelValue for ThreadPoolWorkerId { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0 as i64) + } +} + +impl LabelGroup for ThreadPoolWorkerId { + fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) { + v.write_value(LabelName::from_str("worker"), self); + } +} + +impl LabelSet for ThreadPoolWorkers { + type Value<'a> = ThreadPoolWorkerId; + + fn dynamic_cardinality(&self) -> Option { + Some(self.0) + } + + fn encode(&self, value: Self::Value<'_>) -> Option { + (value.0 < self.0).then_some(value.0) + } + + fn decode(&self, value: usize) -> Self::Value<'_> { + ThreadPoolWorkerId(value) + } +} + +impl FixedCardinalitySet for ThreadPoolWorkers { + fn cardinality(&self) -> usize { + self.0 + } +} + +#[derive(MetricGroup)] +#[metric(new(workers: usize))] +pub struct ThreadPoolMetrics { + pub injector_queue_depth: Gauge, + #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_queue_depth: GaugeVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_turns_total: CounterVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_skips_total: CounterVec, +} diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index ed80675f8a..862facb4e5 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -6,11 +6,14 @@ //! * //! * +mod countmin; mod exchange; mod key; mod messages; +mod pbkdf2; mod secret; mod signature; +pub mod threadpool; pub use exchange::{exchange, Exchange}; pub use key::ScramKey; @@ -56,9 +59,13 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::sasl::{Mechanism, Step}; + use crate::{ + intern::EndpointIdInt, + sasl::{Mechanism, Step}, + EndpointId, + }; - use super::{Exchange, ServerSecret}; + use super::{threadpool::ThreadPool, Exchange, ServerSecret}; #[test] fn snapshot() { @@ -112,8 +119,13 @@ mod tests { } async fn run_round_trip_test(server_password: &str, client_password: &str) { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + let scram_secret = ServerSecret::build(server_password).await.unwrap(); - let outcome = super::exchange(&scram_secret, client_password.as_bytes()) + let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes()) .await .unwrap(); diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs new file mode 100644 index 0000000000..f2b794e5fe --- /dev/null +++ b/proxy/src/scram/countmin.rs @@ -0,0 +1,173 @@ +use std::hash::Hash; + +/// estimator of hash jobs per second. +/// +pub struct CountMinSketch { + // one for each depth + hashers: Vec, + width: usize, + depth: usize, + // buckets, width*depth + buckets: Vec, +} + +impl CountMinSketch { + /// Given parameters (ε, δ), + /// set width = ceil(e/ε) + /// set depth = ceil(ln(1/δ)) + /// + /// guarantees: + /// actual <= estimate + /// estimate <= actual + ε * N with probability 1 - δ + /// where N is the cardinality of the stream + pub fn with_params(epsilon: f64, delta: f64) -> Self { + CountMinSketch::new( + (std::f64::consts::E / epsilon).ceil() as usize, + (1.0_f64 / delta).ln().ceil() as usize, + ) + } + + fn new(width: usize, depth: usize) -> Self { + Self { + #[cfg(test)] + hashers: (0..depth) + .map(|i| { + // digits of pi for good randomness + ahash::RandomState::with_seeds( + 314159265358979323, + 84626433832795028, + 84197169399375105, + 82097494459230781 + i as u64, + ) + }) + .collect(), + #[cfg(not(test))] + hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(), + width, + depth, + buckets: vec![0; width * depth], + } + } + + pub fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { + let mut min = u32::MAX; + for row in 0..self.depth { + let col = (self.hashers[row].hash_one(t) as usize) % self.width; + + let row = &mut self.buckets[row * self.width..][..self.width]; + row[col] = row[col].saturating_add(x); + min = std::cmp::min(min, row[col]); + } + min + } + + pub fn reset(&mut self) { + self.buckets.clear(); + self.buckets.resize(self.width * self.depth, 0); + } +} + +#[cfg(test)] +mod tests { + use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + + use super::CountMinSketch; + + fn eval_precision(n: usize, p: f64, q: f64) -> usize { + // fixed value of phi for consistent test + let mut rng = StdRng::seed_from_u64(16180339887498948482); + + #[allow(non_snake_case)] + let mut N = 0; + + let mut ids = vec![]; + + for _ in 0..n { + // number of insert operations + let n = rng.gen_range(1..100); + // number to insert at once + let m = rng.gen_range(1..4096); + + let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + ids.push((id, n, m)); + + // N = sum(actual) + N += n * m; + } + + // q% of counts will be within p of the actual value + let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + dbg!(sketch.buckets.len()); + + // insert a bunch of entries in a random order + let mut ids2 = ids.clone(); + while !ids2.is_empty() { + ids2.shuffle(&mut rng); + + let mut i = 0; + while i < ids2.len() { + sketch.inc_and_return(&ids2[i].0, ids2[i].1); + ids2[i].2 -= 1; + if ids2[i].2 == 0 { + ids2.remove(i); + } else { + i += 1; + } + } + } + + let mut within_p = 0; + for (id, n, m) in ids { + let actual = n * m; + let estimate = sketch.inc_and_return(&id, 0); + + // This estimate has the guarantee that actual <= estimate + assert!(actual <= estimate); + + // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ. + // ε = p / N, δ = 1 - q; + // therefore, estimate <= actual + p with probability q. + if estimate as f64 <= actual as f64 + p { + within_p += 1; + } + } + within_p + } + + #[test] + fn precision() { + assert_eq!(eval_precision(100, 100.0, 0.99), 100); + assert_eq!(eval_precision(1000, 100.0, 0.99), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.99), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000); + + // seems to be more precise than the literature indicates? + // probably numbers are too small to truly represent the probabilities. + assert_eq!(eval_precision(100, 4096.0, 0.90), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.1), 98); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 991); + } + + // returns memory usage in bytes, and the time complexity per insert. + fn eval_cost(p: f64, q: f64) -> (usize, usize) { + #[allow(non_snake_case)] + // N = sum(actual) + // Let's assume 1021 samples, all of 4096 + let N = 1021 * 4096; + let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + let memory = std::mem::size_of::() * sketch.buckets.len(); + let time = sketch.depth; + (memory, time) + } + + #[test] + fn memory_usage() { + assert_eq!(eval_cost(100.0, 0.99), (2273580, 5)); + assert_eq!(eval_cost(4096.0, 0.99), (55520, 5)); + assert_eq!(eval_cost(4096.0, 0.90), (33312, 3)); + assert_eq!(eval_cost(4096.0, 0.1), (11104, 1)); + } +} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 89dd33e59f..d0adbc780e 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -4,15 +4,17 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; use sha2::Sha256; -use tokio::task::yield_now; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; +use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use super::threadpool::ThreadPool; use super::ScramKey; use crate::config; +use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. @@ -74,37 +76,18 @@ impl<'a> Exchange<'a> { } } -// copied from -async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { - let hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); - let mut prev = hmac - .clone() - .chain_update(salt) - .chain_update(1u32.to_be_bytes()) - .finalize() - .into_bytes(); - - let mut hi = prev; - - for i in 1..iterations { - prev = hmac.clone().chain_update(prev).finalize().into_bytes(); - - for (hi, prev) in hi.iter_mut().zip(prev) { - *hi ^= prev; - } - // yield every ~250us - // hopefully reduces tail latencies - if i % 1024 == 0 { - yield_now().await - } - } - - hi.into() -} - // copied from -async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey { - let salted_password = pbkdf2(password, salt, iterations).await; +async fn derive_client_key( + pool: &ThreadPool, + endpoint: EndpointIdInt, + password: &[u8], + salt: &[u8], + iterations: u32, +) -> ScramKey { + let salted_password = pool + .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) + .await + .expect("job should not be cancelled"); let make_key = |name| { let key = Hmac::::new_from_slice(&salted_password) @@ -119,11 +102,13 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr } pub async fn exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, secret: &ServerSecret, password: &[u8], ) -> sasl::Result> { let salt = base64::decode(&secret.salt_base64)?; - let client_key = derive_client_key(password, &salt, secret.iterations).await; + let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await; if secret.is_password_invalid(&client_key).into() { Ok(sasl::Outcome::Failure("password doesn't match")) diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs new file mode 100644 index 0000000000..a803ba7e1b --- /dev/null +++ b/proxy/src/scram/pbkdf2.rs @@ -0,0 +1,89 @@ +use hmac::{ + digest::{consts::U32, generic_array::GenericArray}, + Hmac, Mac, +}; +use sha2::Sha256; + +pub struct Pbkdf2 { + hmac: Hmac, + prev: GenericArray, + hi: GenericArray, + iterations: u32, +} + +// inspired from +impl Pbkdf2 { + pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { + let hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + + let prev = hmac + .clone() + .chain_update(salt) + .chain_update(1u32.to_be_bytes()) + .finalize() + .into_bytes(); + + Self { + hmac, + // one consumed for the hash above + iterations: iterations - 1, + hi: prev, + prev, + } + } + + pub fn cost(&self) -> u32 { + (self.iterations).clamp(0, 4096) + } + + pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> { + let Self { + hmac, + prev, + hi, + iterations, + } = self; + + // only do 4096 iterations per turn before sharing the thread for fairness + let n = (*iterations).clamp(0, 4096); + for _ in 0..n { + *prev = hmac.clone().chain_update(*prev).finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(*prev) { + *hi ^= prev; + } + } + + *iterations -= n; + if *iterations == 0 { + std::task::Poll::Ready((*hi).into()) + } else { + std::task::Poll::Pending + } + } +} + +#[cfg(test)] +mod tests { + use super::Pbkdf2; + use pbkdf2::pbkdf2_hmac_array; + use sha2::Sha256; + + #[test] + fn works() { + let salt = b"sodium chloride"; + let pass = b"Ne0n_!5_50_C007"; + + let mut job = Pbkdf2::start(pass, salt, 600000); + let hash = loop { + let std::task::Poll::Ready(hash) = job.turn() else { + continue; + }; + break hash; + }; + + let expected = pbkdf2_hmac_array::(pass, salt, 600000); + assert_eq!(hash, expected) + } +} diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs new file mode 100644 index 0000000000..7701b869a3 --- /dev/null +++ b/proxy/src/scram/threadpool.rs @@ -0,0 +1,321 @@ +//! Custom threadpool implementation for password hashing. +//! +//! Requirements: +//! 1. Fairness per endpoint. +//! 2. Yield support for high iteration counts. + +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; + +use crossbeam_deque::{Injector, Stealer, Worker}; +use itertools::Itertools; +use parking_lot::{Condvar, Mutex}; +use rand::Rng; +use rand::{rngs::SmallRng, SeedableRng}; +use tokio::sync::oneshot; + +use crate::{ + intern::EndpointIdInt, + metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, + scram::countmin::CountMinSketch, +}; + +use super::pbkdf2::Pbkdf2; + +pub struct ThreadPool { + queue: Injector, + stealers: Vec>, + parkers: Vec<(Condvar, Mutex)>, + /// bitpacked representation. + /// lower 8 bits = number of sleeping threads + /// next 8 bits = number of idle threads (searching for work) + counters: AtomicU64, + + pub metrics: Arc, +} + +#[derive(PartialEq)] +enum ThreadState { + Parked, + Active, +} + +impl ThreadPool { + pub fn new(n_workers: u8) -> Arc { + let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec(); + let stealers = workers.iter().map(|w| w.stealer()).collect_vec(); + + let parkers = (0..n_workers) + .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active))) + .collect_vec(); + + let pool = Arc::new(Self { + queue: Injector::new(), + stealers, + parkers, + // threads start searching for work + counters: AtomicU64::new((n_workers as u64) << 8), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + }); + + for (i, worker) in workers.into_iter().enumerate() { + let pool = Arc::clone(&pool); + std::thread::spawn(move || thread_rt(pool, worker, i)); + } + + pool + } + + pub fn spawn_job( + &self, + endpoint: EndpointIdInt, + pbkdf2: Pbkdf2, + ) -> oneshot::Receiver<[u8; 32]> { + let (tx, rx) = oneshot::channel(); + + let queue_was_empty = self.queue.is_empty(); + + self.metrics.injector_queue_depth.inc(); + self.queue.push(JobSpec { + response: tx, + pbkdf2, + endpoint, + }); + + // inspired from + let counts = self.counters.load(Ordering::SeqCst); + let num_awake_but_idle = (counts >> 8) & 0xff; + let num_sleepers = counts & 0xff; + + // If the queue is non-empty, then we always wake up a worker + // -- clearly the existing idle jobs aren't enough. Otherwise, + // check to see if we have enough idle workers. + if !queue_was_empty || num_awake_but_idle == 0 { + let num_to_wake = Ord::min(1, num_sleepers); + self.wake_any_threads(num_to_wake); + } + + rx + } + + #[cold] + fn wake_any_threads(&self, mut num_to_wake: u64) { + if num_to_wake > 0 { + for i in 0..self.parkers.len() { + if self.wake_specific_thread(i) { + num_to_wake -= 1; + if num_to_wake == 0 { + return; + } + } + } + } + } + + fn wake_specific_thread(&self, index: usize) -> bool { + let (condvar, lock) = &self.parkers[index]; + + let mut state = lock.lock(); + if *state == ThreadState::Parked { + condvar.notify_one(); + + // When the thread went to sleep, it will have incremented + // this value. When we wake it, its our job to decrement + // it. We could have the thread do it, but that would + // introduce a delay between when the thread was + // *notified* and when this counter was decremented. That + // might mislead people with new work into thinking that + // there are sleeping threads that they should try to + // wake, when in fact there is nothing left for them to + // do. + self.counters.fetch_sub(1, Ordering::SeqCst); + *state = ThreadState::Active; + + true + } else { + false + } + } + + fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker) -> Option { + // announce thread as idle + self.counters.fetch_add(256, Ordering::SeqCst); + + // try steal from the global queue + loop { + match self.queue.steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => { + self.metrics + .injector_queue_depth + .set(self.queue.len() as i64); + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return Some(job); + } + crossbeam_deque::Steal::Retry => continue, + crossbeam_deque::Steal::Empty => break, + } + } + + // try steal from our neighbours + loop { + let mut retry = false; + let start = rng.gen_range(0..self.stealers.len()); + let job = (start..self.stealers.len()) + .chain(0..start) + .filter(|i| *i != skip) + .find_map( + |victim| match self.stealers[victim].steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => Some(job), + crossbeam_deque::Steal::Empty => None, + crossbeam_deque::Steal::Retry => { + retry = true; + None + } + }, + ); + if job.is_some() { + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return job; + } + if !retry { + return None; + } + } + } +} + +fn thread_rt(pool: Arc, worker: Worker, index: usize) { + /// interval when we should steal from the global queue + /// so that tail latencies are managed appropriately + const STEAL_INTERVAL: usize = 61; + + /// How often to reset the sketch values + const SKETCH_RESET_INTERVAL: usize = 1021; + + let mut rng = SmallRng::from_entropy(); + + // used to determine whether we should temporarily skip tasks for fairness. + // 99% of estimates will overcount by no more than 4096 samples + let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01); + + let (condvar, lock) = &pool.parkers[index]; + + 'wait: loop { + // wait for notification of work + { + let mut lock = lock.lock(); + + // queue is empty + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), 0); + + // subtract 1 from idle count, add 1 to sleeping count. + pool.counters.fetch_sub(255, Ordering::SeqCst); + + *lock = ThreadState::Parked; + condvar.wait(&mut lock); + } + + for i in 0.. { + let mut job = match worker + .pop() + .or_else(|| pool.steal(&mut rng, index, &worker)) + { + Some(job) => job, + None => continue 'wait, + }; + + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), worker.len() as i64); + + // receiver is closed, cancel the task + if !job.response.is_closed() { + let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost()); + + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. + let probability = P.ln() / (P + rate as f64).ln(); + if pool.queue.len() > 32 || rng.gen_bool(probability) { + pool.metrics + .worker_task_turns_total + .inc(ThreadPoolWorkerId(index)); + + match job.pbkdf2.turn() { + std::task::Poll::Ready(result) => { + let _ = job.response.send(result); + } + std::task::Poll::Pending => worker.push(job), + } + } else { + pool.metrics + .worker_task_skips_total + .inc(ThreadPoolWorkerId(index)); + + // skip for now + worker.push(job) + } + } + + // if we get stuck with a few long lived jobs in the queue + // it's better to try and steal from the queue too for fairness + if i % STEAL_INTERVAL == 0 { + let _ = pool.queue.steal_batch(&worker); + } + + if i % SKETCH_RESET_INTERVAL == 0 { + sketch.reset(); + } + } + } +} + +struct JobSpec { + response: oneshot::Sender<[u8; 32]>, + pbkdf2: Pbkdf2, + endpoint: EndpointIdInt, +} + +#[cfg(test)] +mod tests { + use crate::EndpointId; + + use super::*; + + #[tokio::test] + async fn hash_is_correct() { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + + let salt = [0x55; 32]; + let actual = pool + .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) + .await + .unwrap(); + + let expected = [ + 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, + 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, + ]; + assert_eq!(actual, expected) + } +} diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6b79c12316..52fc7b556a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -15,6 +15,7 @@ use crate::{ }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, + intern::EndpointIdInt, proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, rate_limiter::EndpointRateLimiter, Host, @@ -66,8 +67,14 @@ impl PoolingBackend { return Err(AuthError::auth_failed(&*user_info.user)); } }; - let auth_outcome = - crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; + let ep = EndpointIdInt::from(&conn_info.user_info.endpoint); + let auth_outcome = crate::auth::validate_password_and_exchange( + &config.thread_pool, + ep, + &conn_info.password, + secret, + ) + .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7582562450..f364a6c2e0 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -13,6 +13,7 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] } @@ -85,6 +86,7 @@ zstd-safe = { version = "7", default-features = false, features = ["arrays", "le zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } From ddd8ebd2536f90872da2678aecbd94a3e1b3f547 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 13:06:00 -0400 Subject: [PATCH 124/130] chore(pageserver): use kebab case for aux file flag (#7840) part of https://github.com/neondatabase/neon/issues/7462 --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 45 +++++++++++-------- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/fixtures/utils.py | 6 +-- .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_aux_files.py | 12 +++-- 5 files changed, 39 insertions(+), 28 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 55e9c48421..6b0403c8ab 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,7 +9,6 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, - str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -334,14 +333,28 @@ pub struct TenantConfig { /// Unset -> V1 /// -> V2 /// -> CrossValidation -> V2 -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] pub enum AuxFilePolicy { /// V1 aux file policy: store everything in AUX_FILE_KEY + #[strum(ascii_case_insensitive)] V1, /// V2 aux file policy: store in the AUX_FILE keyspace + #[strum(ascii_case_insensitive)] V2, /// Cross validation runs both formats on the write path and does validation /// on the read path. + #[strum(ascii_case_insensitive)] CrossValidation, } @@ -407,23 +420,6 @@ impl AuxFilePolicy { } } -impl FromStr for AuxFilePolicy { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - let s = s.to_lowercase(); - if s == "v1" { - Ok(Self::V1) - } else if s == "v2" { - Ok(Self::V2) - } else if s == "crossvalidation" || s == "cross_validation" { - Ok(Self::CrossValidation) - } else { - anyhow::bail!("cannot parse {} to aux file policy", s) - } - } -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum EvictionPolicy { @@ -1405,6 +1401,7 @@ impl PagestreamBeMessage { #[cfg(test)] mod tests { use serde_json::json; + use std::str::FromStr; use super::*; @@ -1667,4 +1664,14 @@ mod tests { AuxFilePolicy::V2 )); } + + #[test] + fn test_aux_parse() { + assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); + assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); + assert_eq!( + AuxFilePolicy::from_str("cross-validation").unwrap(), + AuxFilePolicy::CrossValidation + ); + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b02054a702..796ae7217b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1625,7 +1625,7 @@ class NeonCli(AbstractNeonCli): args.extend(["-c", "switch_aux_file_policy:v1"]) if aux_file_v2 is AuxFileStore.CrossValidation: - args.extend(["-c", "switch_aux_file_policy:cross_validation"]) + args.extend(["-c", "switch_aux_file_policy:cross-validation"]) if set_default: args.append("--set-default") diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 70263245e7..22bb43c580 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -495,9 +495,9 @@ def assert_no_errors(log_file, service, allowed_errors): @enum.unique class AuxFileStore(str, enum.Enum): - V1 = "V1" - V2 = "V2" - CrossValidation = "CrossValidation" + V1 = "v1" + V2 = "v2" + CrossValidation = "cross-validation" def __repr__(self) -> str: return f"'aux-{self.value}'" diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 2ec375271c..531679c7af 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_aux_file_policy": "CrossValidation", + "switch_aux_file_policy": "cross-validation", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py index be9c41a867..5328aef156 100644 --- a/test_runner/regress/test_aux_files.py +++ b/test_runner/regress/test_aux_files.py @@ -1,5 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AuxFileStore, NeonEnvBuilder, logical_replication_sync, ) @@ -14,7 +15,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): timeline_id = env.initial_timeline tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = "V2" + tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 client.set_tenant_config(tenant_id, tenant_config) # aux file v2 is enabled on the write path, so for now, it should be unset (or null) assert ( @@ -49,7 +50,10 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): with env.pageserver.http_client() as client: # aux file v2 flag should be enabled at this point - assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2" + assert ( + client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] + == AuxFileStore.V2 + ) with env.pageserver.http_client() as client: tenant_config = client.tenant_config(tenant_id).effective_config tenant_config["switch_aux_file_policy"] = "V1" @@ -59,7 +63,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ "last_aux_file_policy" ] - == "V2" + == AuxFileStore.V2 ) env.pageserver.restart() with env.pageserver.http_client() as client: @@ -68,5 +72,5 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ "last_aux_file_policy" ] - == "V2" + == AuxFileStore.V2 ) From 014f822a789efd2a64fbcf79de5acd2f07018952 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 22 May 2024 19:17:47 +0100 Subject: [PATCH 125/130] tests: refine test_secondary_background_downloads (#7829) ## Problem This test relied on some sleeps, and was failing ~5% of the time. ## Summary of changes Use log-watching rather than straight waits, and make timeouts more generous for the CI environment. --- .../regress/test_pageserver_secondary.py | 51 +++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 127340a1e7..25a3f8521c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -578,7 +578,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): default_download_period_secs = 60 # The upload period, which will also be the download once the secondary has seen its first heatmap - upload_period_secs = 20 + upload_period_secs = 30 for _i in range(0, tenant_count): tenant_id = TenantId.generate() @@ -596,11 +596,26 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): tenant_timelines[tenant_id] = [timeline_a, timeline_b] + def await_log(pageserver, deadline, expression): + """ + Wrapper around assert_log_contains that waits with a deadline rather than timeout + """ + now = time.time() + if now > deadline: + raise RuntimeError(f"Timed out waiting for {expression}") + else: + timeout = int(deadline - now) + 1 + try: + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + except: + log.error(f"Timed out waiting for '{expression}'") + raise + t_start = time.time() # Wait long enough that the background downloads should happen; we expect all the inital layers # of all the initial timelines to show up on the secondary location of each tenant. - time.sleep(default_download_period_secs * 1.5) + initial_download_deadline = time.time() + default_download_period_secs * 3 for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -608,8 +623,24 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # We only have two: the other one must be secondary ps_secondary = next(p for p in env.pageservers if p != ps_attached) + now = time.time() + if now > initial_download_deadline: + raise RuntimeError("Timed out waiting for initial secondary download") + else: + for timeline_id in timelines: + log.info( + f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + initial_download_deadline, + f".*{timeline_id}.*Wrote timeline_detail.*", + ) + for timeline_id in timelines: - log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + log.info( + f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}" + ) # One or more layers should be present for all timelines assert ps_secondary.list_layers(tenant_id, timeline_id) @@ -617,7 +648,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor - time.sleep(upload_period_secs * 2.5) + deletion_deadline = time.time() + upload_period_secs * 3 for tenant_id, timelines in tenant_timelines.items(): attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] @@ -625,6 +656,16 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # We only have two: the other one must be secondary ps_secondary = next(p for p in env.pageservers if p != ps_attached) + expect_del_timeline = timelines[1] + log.info( + f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + deletion_deadline, + f".*Timeline no longer in heatmap.*{expect_del_timeline}.*", + ) + # This one was not deleted assert ps_secondary.list_layers(tenant_id, timelines[0]) @@ -632,7 +673,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): log.info( f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}" ) - assert not ps_secondary.list_layers(tenant_id, timelines[1]) + assert not ps_secondary.list_layers(tenant_id, expect_del_timeline) t_end = time.time() From f98fdd20e390822a890f596a7932ca81b47e00ce Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 22 May 2024 19:38:22 +0100 Subject: [PATCH 126/130] tests: add a couple of allow lists for shutdown cases (#7844) ## Problem Failures on some of our uglier shutdown log messages: https://neon-github-public-dev.s3.amazonaws.com/reports/main/9192662995/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/51b365408678c66f/ ## Summary of changes - Allow-list these errors. --- test_runner/fixtures/pageserver/allowed_errors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 91cd67d107..fa6e4eaafd 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -70,6 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", + ".*Compaction failed.*, retrying in .*: timeline shutting down.*", # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", @@ -91,6 +92,10 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", + # Can happen during shutdown + ".*scheduling deletion on drop failed: queue is in state Stopped.*", + # Can happen during shutdown + ".*ignoring failure to find gc cutoffs: timeline shutting down.*", ) From 4a278cce7ce5b7f32360e85fd41219df95cc9a86 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 15:05:26 -0400 Subject: [PATCH 127/130] chore(pageserver): add force aux file policy switch handler (#7842) For existing users, we want to allow doing a force switch for their aux file policy. Part of #7462 --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 78 ++++++++++++++++++----------- pageserver/src/pgdatadir_mapping.rs | 14 +----- pageserver/src/tenant.rs | 61 ++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 8 +++ 4 files changed, 119 insertions(+), 42 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 34b9806a26..7b55e88096 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -16,6 +16,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; @@ -2307,6 +2308,31 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } +async fn force_aux_policy_switch_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; + let policy: AuxFilePolicy = json_request(&mut r).await?; + + let state = get_state(&r); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + timeline + .do_switch_aux_policy(policy) + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, @@ -2384,19 +2410,9 @@ async fn list_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let process = || async move { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let files = timeline.list_aux_files(body.lsn, &ctx).await?; - Ok::<_, anyhow::Error>(files) - }; - - match process().await { - Ok(st) => json_response(StatusCode::OK, st), - Err(err) => json_response( - StatusCode::INTERNAL_SERVER_ERROR, - ApiError::InternalServerError(err).to_string(), - ), - } + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let files = timeline.list_aux_files(body.lsn, &ctx).await?; + json_response(StatusCode::OK, files) } async fn ingest_aux_files( @@ -2414,24 +2430,22 @@ async fn ingest_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let process = || async move { - let mut modification = timeline.begin_modification(Lsn( - timeline.get_last_record_lsn().0 + 8 - ) /* advance LSN by 8 */); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - for (fname, content) in body.aux_files { - modification - .put_file(&fname, content.as_bytes(), &ctx) - .await?; - } - modification.commit(&ctx).await?; - Ok::<_, anyhow::Error>(()) - }; - - match process().await { - Ok(st) => json_response(StatusCode::OK, st), - Err(err) => Err(ApiError::InternalServerError(err)), + let mut modification = timeline.begin_modification( + Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */ + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + for (fname, content) in body.aux_files { + modification + .put_file(&fname, content.as_bytes(), &ctx) + .await + .map_err(ApiError::InternalServerError)?; } + modification + .commit(&ctx) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) } /// Report on the largest tenants on this pageserver, for the storage controller to identify @@ -2814,6 +2828,10 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", + |r| api_handler(r, force_aux_policy_switch_handler), + ) .get("/v1/utilization", |r| api_handler(r, get_utilization)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7dea687c46..afba34c6d1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1489,14 +1489,7 @@ impl<'a> DatadirModification<'a> { if aux_files_key_v1.is_empty() { None } else { - self.tline - .last_aux_file_policy - .store(Some(AuxFilePolicy::V1)); - self.tline - .remote_client - .schedule_index_upload_for_aux_file_policy_update(Some( - AuxFilePolicy::V1, - ))?; + self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; Some(AuxFilePolicy::V1) } } else { @@ -1504,10 +1497,7 @@ impl<'a> DatadirModification<'a> { }; if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { - self.tline.last_aux_file_policy.store(Some(switch_policy)); - self.tline - .remote_client - .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?; + self.tline.do_switch_aux_policy(switch_policy)?; info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); switch_policy } else { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index caf26e0a0b..2bb199b228 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5999,6 +5999,67 @@ mod tests { ); } + #[tokio::test] + async fn aux_file_policy_force_switch() { + let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "dirty index_part.json reflected state is yet to be updated" + ); + + // lose all data from v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // read data ingested in v2 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + // lose all data from v1 + assert_eq!(files.get("pg_logical/mappings/test1"), None); + } + #[tokio::test] async fn aux_file_policy_auto_detect() { let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1f8ee9ffc4..63d03b3c68 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4606,6 +4606,14 @@ impl Timeline { ) -> Result, anyhow::Error> { detach_ancestor::complete(self, tenant, prepared, ctx).await } + + /// Switch aux file policy and schedule upload to the index part. + pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { + self.last_aux_file_policy.store(Some(policy)); + self.remote_client + .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; + Ok(()) + } } /// Top-level failure to compact. From ff560a1113046ff29acf2723c0183db7c2d128f1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 22 May 2024 17:28:47 -0400 Subject: [PATCH 128/130] chore(pageserver): use kebab case for compaction algorithms (#7845) Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 21 ++++++++++++++++--- pageserver/src/tenant.rs | 10 ++++++--- pageserver/src/tenant/config.rs | 13 ++++++++---- pageserver/src/tenant/timeline.rs | 14 +++++++------ .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_compaction.py | 4 ++-- 6 files changed, 45 insertions(+), 19 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6b0403c8ab..9311dab33c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -305,7 +305,7 @@ pub struct TenantConfig { pub compaction_period: Option, pub compaction_threshold: Option, // defer parsing compaction_algorithm, like eviction_policy - pub compaction_algorithm: Option, + pub compaction_algorithm: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -438,13 +438,28 @@ impl EvictionPolicy { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "kind")] +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] pub enum CompactionAlgorithm { Legacy, Tiered, } +#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] +pub struct CompactionAlgorithmSettings { + pub kind: CompactionAlgorithm, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2bb199b228..540eb10ed2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3973,7 +3973,7 @@ mod tests { use hex_literal::hex; use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; - use pageserver_api::models::CompactionAlgorithm; + use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -5169,7 +5169,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name)?; - harness.tenant_conf.compaction_algorithm = compaction_algorithm; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5526,7 +5528,9 @@ mod tests { compaction_algorithm: CompactionAlgorithm, ) -> anyhow::Result<()> { let mut harness = TenantHarness::create(name)?; - harness.tenant_conf.compaction_algorithm = compaction_algorithm; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a695363cdc..342d705954 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,6 +11,7 @@ use anyhow::bail; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithm; +use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; @@ -320,7 +321,7 @@ pub struct TenantConf { pub compaction_period: Duration, // Level0 delta layer threshold for compaction. pub compaction_threshold: usize, - pub compaction_algorithm: CompactionAlgorithm, + pub compaction_algorithm: CompactionAlgorithmSettings, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -406,7 +407,7 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] - pub compaction_algorithm: Option, + pub compaction_algorithm: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] @@ -497,7 +498,9 @@ impl TenantConfOpt { .unwrap_or(global_conf.compaction_threshold), compaction_algorithm: self .compaction_algorithm - .unwrap_or(global_conf.compaction_algorithm), + .as_ref() + .unwrap_or(&global_conf.compaction_algorithm) + .clone(), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -550,7 +553,9 @@ impl Default for TenantConf { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM, + compaction_algorithm: CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 63d03b3c68..881e7f8f3c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,9 +23,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, - DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - LsnLease, TimelineState, + AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -1700,7 +1700,7 @@ impl Timeline { return Ok(()); } - match self.get_compaction_algorithm() { + match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } @@ -2096,12 +2096,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } - fn get_compaction_algorithm(&self) -> CompactionAlgorithm { + fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings { let tenant_conf = &self.tenant_conf.load(); tenant_conf .tenant_conf .compaction_algorithm - .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) + .as_ref() + .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm) + .clone() } fn get_eviction_policy(&self) -> EvictionPolicy { diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 531679c7af..8c60b454d8 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -162,7 +162,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "checkpoint_distance": 10000, "checkpoint_timeout": "13m", "compaction_algorithm": { - "kind": "Tiered", + "kind": "tiered", }, "eviction_policy": { "kind": "LayerAccessThreshold", diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 6a3515e1bd..4850a5c688 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -194,8 +194,8 @@ def test_sharding_compaction( class CompactionAlgorithm(str, enum.Enum): - LEGACY = "Legacy" - TIERED = "Tiered" + LEGACY = "legacy" + TIERED = "tiered" @pytest.mark.parametrize( From eb0c026aac95bf8aad1c115aabb5697e86594eac Mon Sep 17 00:00:00 2001 From: Oleg Vasilev Date: Thu, 23 May 2024 00:48:59 +0300 Subject: [PATCH 129/130] Bump vm-builder v0.28.1 -> v0.29.3 (#7849) One change: runner: allow coredump collection (#931) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2ab1417d6d..f8c011a0a5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -934,7 +934,7 @@ jobs: matrix: version: [ v14, v15, v16 ] env: - VM_BUILDER_VERSION: v0.28.1 + VM_BUILDER_VERSION: v0.29.3 steps: - name: Checkout From 03c2c569be7a0e379743d99884c2990d484c67f2 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 23 May 2024 11:41:29 +0200 Subject: [PATCH 130/130] [proxy] Do not fail after parquet upload error (#7858) ## Problem If the parquet upload was unsuccessful, it will panic. ## Summary of changes Write error in logs instead. --- proxy/src/context/parquet.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 392821c430..a213a32ca4 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -355,7 +355,7 @@ async fn upload_parquet( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; let cancel = CancellationToken::new(); - backoff::retry( + let maybe_err = backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); storage @@ -372,7 +372,12 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .context("request_data_upload") + .err(); + + if let Some(err) = maybe_err { + tracing::warn!(%id, %err, "failed to upload request data"); + } Ok(buffer.writer()) }