From 84bc3380cc82a27b06717d2f239ac1fea74880fc Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 28 Apr 2025 17:34:47 +0200 Subject: [PATCH 001/142] Remove SAFEKEEPER_AUTH_TOKEN env var parsing from safekeeper (#11698) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Remove SAFEKEEPER_AUTH_TOKEN env var parsing from safekeeper This PR is a follow-up to #11443 that removes the parsing of the `SAFEKEEPER_AUTH_TOKEN` environment variable from the safekeeper codebase while keeping the `auth_token_path` CLI flag functionality. ## Changes: - Removed code that checks for the `SAFEKEEPER_AUTH_TOKEN` environment variable - Updated comments to reflect that only the `auth_token_path` CLI flag is now used As mentioned in PR #11443, the environment variable approach was planned to be deprecated and removed in favor of the file-based approach, which is more secure since environment variables can be quite public in both procfs and unit files. Link to Devin run: https://app.devin.ai/sessions/d6f56cf1b4164ea9880a9a06358a58ac Requested by: arpad@neon.tech --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: arpad@neon.tech Co-authored-by: Arpad Müller --- control_plane/src/safekeeper.rs | 39 ++++++++++++++++++++------------ safekeeper/src/bin/safekeeper.rs | 31 ++++++------------------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 948e3c8c93..eec2c997e6 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -112,7 +112,7 @@ impl SafekeeperNode { } /// Initializes a safekeeper node by creating all necessary files, - /// e.g. SSL certificates. + /// e.g. SSL certificates and JWT token file. pub fn initialize(&self) -> anyhow::Result<()> { if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( @@ -120,6 +120,17 @@ impl SafekeeperNode { &self.datadir_path().join("server.key"), )?; } + + // Generate a token file for authentication with other safekeepers + if self.conf.auth_enabled { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + + let token_path = self.datadir_path().join("peer_jwt_token"); + std::fs::write(token_path, token)?; + } + Ok(()) } @@ -218,14 +229,26 @@ impl SafekeeperNode { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } + if self.conf.auth_enabled { + let token_path = self.datadir_path().join("peer_jwt_token"); + let token_path_str = token_path + .to_str() + .with_context(|| { + format!("Token path {token_path:?} cannot be represented as a unicode string") + })? 
+ .to_owned(); + args.extend(["--auth-token-path".to_owned(), token_path_str]); + } + args.extend_from_slice(extra_opts); + let env_variables = Vec::new(); background_process::start_process( &format!("safekeeper-{id}"), &datadir, &self.env.safekeeper_bin(), &args, - self.safekeeper_env_variables()?, + env_variables, background_process::InitialPidFile::Expect(self.pid_file()), retry_timeout, || async { @@ -239,18 +262,6 @@ impl SafekeeperNode { .await } - fn safekeeper_env_variables(&self) -> anyhow::Result> { - // Generate a token to connect from safekeeper to peers - if self.conf.auth_enabled { - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; - Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)]) - } else { - Ok(Vec::new()) - } - } - /// /// Stop the server. /// diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index dd71420efb..c267a55cb6 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,7 +1,6 @@ // // Main entry point for the safekeeper executable // -use std::env::{VarError, var}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; @@ -354,29 +353,13 @@ async fn main() -> anyhow::Result<()> { }; // Load JWT auth token to connect to other safekeepers for pull_timeline. - // First check if the env var is present, then check the arg with the path. - // We want to deprecate and remove the env var method in the future. - let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") { - Ok(v) => { - info!("loaded JWT token for authentication with safekeepers"); - Some(SecretString::from(v)) - } - Err(VarError::NotPresent) => { - if let Some(auth_token_path) = args.auth_token_path.as_ref() { - info!( - "loading JWT token for authentication with safekeepers from {auth_token_path}" - ); - let auth_token = tokio::fs::read_to_string(auth_token_path).await?; - Some(SecretString::from(auth_token.trim().to_owned())) - } else { - info!("no JWT token for authentication with safekeepers detected"); - None - } - } - Err(_) => { - warn!("JWT token for authentication with safekeepers is not unicode"); - None - } + let sk_auth_token = if let Some(auth_token_path) = args.auth_token_path.as_ref() { + info!("loading JWT token for authentication with safekeepers from {auth_token_path}"); + let auth_token = tokio::fs::read_to_string(auth_token_path).await?; + Some(SecretString::from(auth_token.trim().to_owned())) + } else { + info!("no JWT token for authentication with safekeepers detected"); + None }; let ssl_ca_certs = match args.ssl_ca_file.as_ref() { From b1fa68f6592672a8e988b85f0de83749fece2dab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Mon, 28 Apr 2025 18:37:36 +0200 Subject: [PATCH 002/142] impr(ci): switch release PR creation over to use python based action (#11679) ## Problem Our different repositories had both had code to achieve very similar results in terms of release PR creation, but they were structured differently and had different extensions. This was likely to cause maintainability problems in the long run. ## Summary of changes Switch to a python cli based composite action for creating the release PRs that will also be introduced in our other repos later. ## To Do - [ ] Adjust our docs to reflect the changes from this. 
--- .github/workflows/_create-release-pr.yml | 103 ----------------------- .github/workflows/release-compute.yml | 12 +++ .github/workflows/release-proxy.yml | 12 +++ .github/workflows/release-storage.yml | 12 +++ .github/workflows/release.yml | 93 ++++++++++---------- 5 files changed, 82 insertions(+), 150 deletions(-) delete mode 100644 .github/workflows/_create-release-pr.yml create mode 100644 .github/workflows/release-compute.yml create mode 100644 .github/workflows/release-proxy.yml create mode 100644 .github/workflows/release-storage.yml diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml deleted file mode 100644 index f96ed7d69b..0000000000 --- a/.github/workflows/_create-release-pr.yml +++ /dev/null @@ -1,103 +0,0 @@ -name: Create Release PR - -on: - workflow_call: - inputs: - component-name: - description: 'Component name' - required: true - type: string - source-branch: - description: 'Source branch' - required: true - type: string - secrets: - ci-access-token: - description: 'CI access token' - required: true - -defaults: - run: - shell: bash -euo pipefail {0} - -permissions: - contents: read - -jobs: - create-release-branch: - runs-on: ubuntu-22.04 - - permissions: - contents: write # for `git push` - - steps: - - name: Harden the runner (Audit all outbound calls) - uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 - with: - egress-policy: audit - - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - ref: ${{ inputs.source-branch }} - fetch-depth: 0 - - - name: Set variables - id: vars - env: - COMPONENT_NAME: ${{ inputs.component-name }} - RELEASE_BRANCH: >- - ${{ - false - || inputs.component-name == 'Storage' && 'release' - || inputs.component-name == 'Proxy' && 'release-proxy' - || inputs.component-name == 'Compute' && 'release-compute' - }} - run: | - now_date=$(date -u +'%Y-%m-%d') - now_time=$(date -u +'%H-%M-%Z') - { - echo "title=${COMPONENT_NAME} release ${now_date}" - echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}" - echo "release-branch=${RELEASE_BRANCH}" - } | tee -a ${GITHUB_OUTPUT} - - - name: Configure git - run: | - git config user.name "github-actions[bot]" - git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - - name: Create RC branch - env: - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - git switch -c "${RC_BRANCH}" - - # Manually create a merge commit on the current branch, keeping the - # tree and setting the parents to the current HEAD and the HEAD of the - # release branch. This commit is what we'll fast-forward the release - # branch to when merging the release branch. 
- # For details on why, look at - # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs - current_tree=$(git rev-parse 'HEAD^{tree}') - release_head=$(git rev-parse "origin/${RELEASE_BRANCH}") - current_head=$(git rev-parse HEAD) - merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") - - # Fast-forward the current branch to the newly created merge_commit - git merge --ff-only ${merge_commit} - - git push origin "${RC_BRANCH}" - - - name: Create a PR into ${{ steps.vars.outputs.release-branch }} - env: - GH_TOKEN: ${{ secrets.ci-access-token }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - gh pr create --title "${TITLE}" \ - --body "" \ - --head "${RC_BRANCH}" \ - --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/release-compute.yml b/.github/workflows/release-compute.yml new file mode 100644 index 0000000000..f123dd2f44 --- /dev/null +++ b/.github/workflows/release-compute.yml @@ -0,0 +1,12 @@ +name: Create compute release PR + +on: + schedule: + - cron: '0 7 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: compute + secrets: inherit diff --git a/.github/workflows/release-proxy.yml b/.github/workflows/release-proxy.yml new file mode 100644 index 0000000000..d9055984d2 --- /dev/null +++ b/.github/workflows/release-proxy.yml @@ -0,0 +1,12 @@ +name: Create proxy release PR + +on: + schedule: + - cron: '0 6 * * TUE' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: proxy + secrets: inherit diff --git a/.github/workflows/release-storage.yml b/.github/workflows/release-storage.yml new file mode 100644 index 0000000000..91f02fddda --- /dev/null +++ b/.github/workflows/release-storage.yml @@ -0,0 +1,12 @@ +name: Create storage release PR + +on: + schedule: + - cron: '0 6 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: storage + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4068eafb95..4b19d6aa3f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,25 +1,34 @@ -name: Create Release Branch +name: Create release PR on: - schedule: - # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * TUE' # Proxy release - - cron: '0 6 * * FRI' # Storage release - - cron: '0 7 * * FRI' # Compute release workflow_dispatch: inputs: - create-storage-release-branch: - type: boolean - description: 'Create Storage release PR' + component: + description: "Component to release" + required: true + type: choice + options: + - compute + - proxy + - storage + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false - create-proxy-release-branch: - type: boolean - description: 'Create Proxy release PR' - required: false - create-compute-release-branch: - type: boolean - description: 'Create Compute release PR' + type: string + default: '' + + workflow_call: + inputs: + component: + description: "Component to release" + required: true + type: string + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false + type: string + default: '' + # No permission for GITHUB_TOKEN by default; the **minimal required** set of 
permissions should be granted in each job. permissions: {} @@ -29,41 +38,31 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }} + create-release-pr: + runs-on: ubuntu-22.04 permissions: contents: write - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Storage' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit - create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }} + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 - permissions: - contents: write + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Proxy' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} - - create-compute-release-branch: - if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }} - - permissions: - contents: write - - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Compute' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + - name: Create release PR + uses: neondatabase/dev-actions/release-pr@02b41460646b70d12dd33e5f56ebc5af2384c993 + with: + component: ${{ inputs.component }} + cherry-pick: ${{ inputs.cherry-pick }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} From 998d2c2ce9315f2ec7b7bc2b5ce0253e6c758d9d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Apr 2025 17:43:35 +0100 Subject: [PATCH 003/142] storcon: use shard 0's initdb for timeline creation (#11727) ## Problem In princple, pageservers with different postgres binaries might generate different initdbs, resulting in inconsistency between shards. To avoid that, we should have shard 0 generate the initdb and other shards re-use it. Fixes: https://github.com/neondatabase/neon/issues/11340 ## Summary of changes - For shards with index greater than zero, set `existing_initdb_timeline_id` in timeline creation to consume the existing initdb rather than creating a new one --- storage_controller/src/service.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ca9b911c4d..0f71a87f13 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3720,6 +3720,10 @@ impl Service { // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard // that will get the first creation request, and propagate the LSN to all the >0 shards. + // + // This also enables non-zero shards to use the initdb that shard 0 generated and uploaded to S3, rather than + // independently generating their own initdb. This guarantees that shards cannot end up with different initial + // states if e.g. they have different postgres binary versions. 
let timeline_info = create_one( shard_zero_tid, shard_zero_locations, @@ -3729,11 +3733,16 @@ impl Service { ) .await?; - // Propagate the LSN that shard zero picked, if caller didn't provide one + // Update the create request for shards >= 0 match &mut create_req.mode { models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => { + // Propagate the LSN that shard zero picked, if caller didn't provide one *ancestor_start_lsn = timeline_info.ancestor_lsn; }, + models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } => { + // For shards >= 0, do not run initdb: use the one that shard 0 uploaded to S3 + *existing_initdb_timeline_id = Some(create_req.new_timeline_id) + } _ => {} } From a750026c2e69344908464c59ba820fc0639abeb1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Apr 2025 12:09:48 -0500 Subject: [PATCH 004/142] Fix compiler warning in libpagestore.c when WITH_SANITIZERS=yes (#11755) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Postgres has a nice self-documenting macro called pg_unreachable() when you want to assert that a location in code won't be hit. Warning in question: ``` /home/tristan957/Projects/work/neon//pgxn/neon/libpagestore.c: In function ‘pageserver_connect’: /home/tristan957/Projects/work/neon//pgxn/neon/libpagestore.c:739:1: warning: control reaches end of non-void function [-Wreturn-type] 739 | } | ^ ``` Signed-off-by: Tristan Partin --- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ccb072d6f9..5287c12a84 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -736,8 +736,8 @@ pageserver_connect(shardno_t shard_no, int elevel) default: neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); } - /* This shouldn't be hit */ - Assert(false); + + pg_unreachable(); } static void From 04826905340da9ac80aa96b2892a137eb91b3b7d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Apr 2025 18:24:55 +0100 Subject: [PATCH 005/142] pageserver: make control_plane_api & generations fully mandatory (#10715) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We had retained the ability to run in a generation-less mode to support test_generations_upgrade, which was replaced with a cleaner backward compat test in https://github.com/neondatabase/neon/pull/10701 ## Summary of changes - Remove all the special cases for "if no generation" or "if no control plane api" - Make control_plane_api config mandatory --------- Co-authored-by: Arpad Müller --- .../pageserver_config/pageserver.toml | 2 ++ pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 13 +++++++--- pageserver/src/controller_upcall_client.rs | 20 ++++++-------- pageserver/src/deletion_queue.rs | 11 +++----- pageserver/src/deletion_queue/validator.rs | 26 ++++++++----------- pageserver/src/tenant.rs | 4 +-- pageserver/src/tenant/mgr.rs | 19 +++----------- .../src/tenant/timeline/import_pgdata.rs | 3 +-- test_runner/fixtures/neon_fixtures.py | 3 +-- .../regress/test_pageserver_generations.py | 2 +- 11 files changed, 43 insertions(+), 62 deletions(-) diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml index 76935453b6..7d603b6c65 100644 --- a/docker-compose/pageserver_config/pageserver.toml +++ 
b/docker-compose/pageserver_config/pageserver.toml @@ -3,3 +3,5 @@ pg_distrib_dir='/usr/local/' listen_pg_addr='0.0.0.0:6400' listen_http_addr='0.0.0.0:9898' remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } +control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address +control_plane_emergency_mode=true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6cfaec955b..4c2572a577 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -504,7 +504,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?, + StorageControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 95143e58b7..ded2805602 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -150,7 +150,7 @@ pub struct PageServerConf { /// not terrible. pub background_task_maximum_delay: Duration, - pub control_plane_api: Option, + pub control_plane_api: Url, /// JWT token for use with the control plane API. pub control_plane_api_token: Option, @@ -438,7 +438,8 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, - control_plane_api, + control_plane_api: control_plane_api + .ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?, control_plane_emergency_mode, heatmap_upload_concurrency, secondary_download_concurrency, @@ -573,6 +574,7 @@ impl PageServerConf { background_task_maximum_delay: Duration::ZERO, load_previous_heatmap: Some(true), generate_unarchival_heatmap: Some(true), + control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()), ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() @@ -641,9 +643,12 @@ mod tests { use super::PageServerConf; #[test] - fn test_empty_config_toml_is_valid() { - // we use Default impl of everything in this situation + fn test_minimal_config_toml_is_valid() { + // The minimal valid config for running a pageserver: + // - control_plane_api is mandatory, as pageservers cannot run in isolation + // - we use Default impl of everything else in this situation let input = r#" + control_plane_api = "http://localhost:6666" "#; let config_toml = toml_edit::de::from_str::(input) .expect("empty config is valid"); diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 59c94f1549..468e5463b0 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -58,14 +58,8 @@ pub trait StorageControllerUpcallApi { impl StorageControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. 
- pub fn new( - conf: &'static PageServerConf, - cancel: &CancellationToken, - ) -> Result, reqwest::Error> { - let mut url = match conf.control_plane_api.as_ref() { - Some(u) => u.clone(), - None => return Ok(None), - }; + pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self { + let mut url = conf.control_plane_api.clone(); if let Ok(mut segs) = url.path_segments_mut() { // This ensures that `url` ends with a slash if it doesn't already. @@ -85,15 +79,17 @@ impl StorageControllerUpcallClient { } for cert in &conf.ssl_ca_certs { - client = client.add_root_certificate(Certificate::from_der(cert.contents())?); + client = client.add_root_certificate( + Certificate::from_der(cert.contents()).expect("Invalid certificate in config"), + ); } - Ok(Some(Self { - http_client: client.build()?, + Self { + http_client: client.build().expect("Failed to construct HTTP client"), base_url: url, node_id: conf.id, cancel: cancel.clone(), - })) + } } #[tracing::instrument(skip_all)] diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 6dd7d741c1..4d62bc4ab5 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -585,7 +585,7 @@ impl DeletionQueue { /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. pub fn new( remote_storage: GenericRemoteStorage, - controller_upcall_client: Option, + controller_upcall_client: C, conf: &'static PageServerConf, ) -> (Self, DeletionQueueWorkers) where @@ -701,7 +701,7 @@ mod test { async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( self.storage.clone(), - Some(self.mock_control_plane.clone()), + self.mock_control_plane.clone(), self.harness.conf, ); @@ -821,11 +821,8 @@ mod test { let mock_control_plane = MockStorageController::new(); - let (deletion_queue, worker) = DeletionQueue::new( - storage.clone(), - Some(mock_control_plane.clone()), - harness.conf, - ); + let (deletion_queue, worker) = + DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf); let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 4e775f15eb..363b1427f5 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -53,7 +53,7 @@ where tx: tokio::sync::mpsc::Sender, // Client for calling into control plane API for validation of deletes - controller_upcall_client: Option, + controller_upcall_client: C, // DeletionLists which are waiting generation validation. Not safe to // execute until [`validate`] has processed them. 
@@ -86,7 +86,7 @@ where conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, - controller_upcall_client: Option, + controller_upcall_client: C, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { @@ -137,20 +137,16 @@ where return Ok(()); } - let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client { - match controller_upcall_client - .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) - .await - { - Ok(tenants) => tenants, - Err(RetryForeverError::ShuttingDown) => { - // The only way a validation call returns an error is when the cancellation token fires - return Err(DeletionQueueError::ShuttingDown); - } + let tenants_valid = match self + .controller_upcall_client + .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) + .await + { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + // The only way a validation call returns an error is when the cancellation token fires + return Err(DeletionQueueError::ShuttingDown); } - } else { - // Control plane API disabled. In legacy mode we consider everything valid. - tenant_generations.keys().map(|k| (*k, true)).collect() }; let mut validated_sequence: Option = None; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 698579e8fb..dcd043c4a1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4254,9 +4254,7 @@ impl TenantShard { deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, ) -> TenantShard { - debug_assert!( - !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() - ); + assert!(!attached_conf.location.generation.is_none()); let (state, mut rx) = watch::channel(state); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2ae7e1e875..86aef9b42c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -346,7 +346,8 @@ async fn init_load_generations( "Emergency mode! Tenants will be attached unsafely using their last known generation" ); emergency_generations(tenant_confs) - } else if let Some(client) = StorageControllerUpcallClient::new(conf, cancel)? { + } else { + let client = StorageControllerUpcallClient::new(conf, cancel); info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { @@ -360,9 +361,6 @@ async fn init_load_generations( anyhow::bail!("Shut down while waiting for control plane re-attach response") } } - } else { - info!("Control plane API not configured, tenant generations are disabled"); - return Ok(None); }; // The deletion queue needs to know about the startup attachment state to decide which (if any) stored @@ -1153,17 +1151,8 @@ impl TenantManager { // Testing hack: if we are configured with no control plane, then drop the generation // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. - let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)?; - if self.conf.control_plane_api.is_none() { - conf.location.generation = Generation::none(); - } - conf - } else { - AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)? 
- }; + let attached_conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; let tenant = tenant_spawn( self.conf, diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index b917fdbfd8..6ab6b90cb6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -163,8 +163,7 @@ pub async fn doit( // Ensure at-least-once delivery of the upcall to storage controller // before we mark the task as done and never come here again. // - let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel)? - .expect("storcon configured"); + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); storcon_client .put_timeline_import_status( timeline.tenant_shard_id, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d668d4b2d..b93df4ede4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1194,8 +1194,7 @@ class NeonEnv: else: cfg["broker"]["listen_addr"] = self.broker.listen_addr() - if self.control_plane_api is not None: - cfg["control_plane_api"] = self.control_plane_api + cfg["control_plane_api"] = self.control_plane_api if self.control_plane_hooks_api is not None: cfg["control_plane_hooks_api"] = self.control_plane_hooks_api diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index fa1cd61206..e3f9982486 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -3,7 +3,7 @@ Tests in this module exercise the pageserver's behavior around generation numbers, as defined in docs/rfcs/025-generation-numbers.md. 
Briefly, the behaviors we require of the pageserver are: -- Do not start a tenant without a generation number if control_plane_api is set +- Do not start a tenant without a generation number - Remote objects must be suffixed with generation - Deletions may only be executed after validating generation - Updates to remote_consistent_lsn may only be made visible after validating generation From 6d6b83e737b5b89796f6d84e06a3f7f420bccd6c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 28 Apr 2025 21:17:03 +0300 Subject: [PATCH 006/142] Prewarm implementation (#11741) ## Problem Continue work on prewarm started in PR https://github.com/neondatabase/neon/pull/11740 ## Summary of changes Implement prewarm using prefetch --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 389 ++++++++++++++++++++++++++++++++++++++++- pgxn/neon/file_cache.h | 14 ++ 2 files changed, 397 insertions(+), 6 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index e2c1f7682f..924e0055c1 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -98,7 +98,6 @@ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log)) - #define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1)) /* @@ -135,6 +134,15 @@ typedef struct FileCacheEntry #define N_COND_VARS 64 #define CV_WAIT_TIMEOUT 10 +#define MAX_PREWARM_WORKERS 8 + +typedef struct PrewarmWorkerState +{ + uint32 prewarmed_pages; + uint32 skipped_pages; + TimestampTz completed; +} PrewarmWorkerState; + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -156,25 +164,43 @@ typedef struct FileCacheControl dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ + PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS]; + size_t n_prewarm_workers; + size_t n_prewarm_entries; + size_t total_prewarm_pages; + size_t prewarm_batch; + bool prewarm_active; + bool prewarm_canceled; + dsm_handle prewarm_lfc_state_handle; } FileCacheControl; -bool lfc_store_prefetch_result; +#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc + +#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks]) +#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8) +#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8) static HTAB *lfc_hash; static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; +static int lfc_prewarm_limit; +static int lfc_prewarm_batch; static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG; static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK; static char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; +static bool lfc_do_prewarm; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; #endif +bool lfc_store_prefetch_result; +bool lfc_prewarm_update_ws_estimation; + #define LFC_ENABLED() (lfc_ctl->limit != 0) /* @@ -500,6 +526,17 @@ lfc_init(void) NULL, NULL); + DefineCustomBoolVariable("neon.prewarm_update_ws_estimation", + "Consider prewarmed pages for working set estimation", + NULL, + 
&lfc_prewarm_update_ws_estimation, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -550,6 +587,32 @@ lfc_init(void) lfc_change_chunk_size, NULL); + DefineCustomIntVariable("neon.file_cache_prewarm_limit", + "Maximal number of prewarmed chunks", + NULL, + &lfc_prewarm_limit, + INT_MAX, /* no limit by default */ + 0, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("neon.file_cache_prewarm_batch", + "Number of pages retrivied by prewarm from page server", + NULL, + &lfc_prewarm_batch, + 64, + 1, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + if (lfc_max_size == 0) return; @@ -563,6 +626,314 @@ lfc_init(void) #endif } +FileCacheState* +lfc_get_state(size_t max_entries) +{ + FileCacheState* fcs = NULL; + + if (lfc_maybe_disabled() || max_entries == 0) /* fast exit if file cache is disabled */ + return NULL; + + LWLockAcquire(lfc_lock, LW_SHARED); + + if (LFC_ENABLED()) + { + dlist_iter iter; + size_t i = 0; + uint8* bitmap; + size_t n_pages = 0; + size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned); + size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries); + fcs = (FileCacheState*)palloc0(state_size); + SET_VARSIZE(fcs, state_size); + fcs->magic = FILE_CACHE_STATE_MAGIC; + fcs->chunk_size_log = lfc_chunk_size_log; + fcs->n_chunks = n_entries; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + dlist_reverse_foreach(iter, &lfc_ctl->lru) + { + FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur); + fcs->chunks[i] = entry->key; + for (int j = 0; j < lfc_blocks_per_chunk; j++) + { + if (GET_STATE(entry, j) != UNAVAILABLE) + { + BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); + n_pages += 1; + } + } + if (++i == n_entries) + break; + } + Assert(i == n_entries); + fcs->n_pages = n_pages; + Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages); + elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages); + } + + LWLockRelease(lfc_lock); + + return fcs; +} + +/* + * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock + * and avoid race conditions with other backends. + */ +void +lfc_prewarm(FileCacheState* fcs, uint32 n_workers) +{ + size_t fcs_chunk_size_log; + size_t n_entries; + size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size); + size_t fcs_size; + dsm_segment *seg; + BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS]; + + + if (!lfc_ensure_opened()) + return; + + if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0) + { + elog(LOG, "LFC: prewarm is disabled"); + return; + } + + if (n_workers > MAX_PREWARM_WORKERS) + { + elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS); + } + + if (fcs == NULL || fcs->n_chunks == 0) + { + elog(LOG, "LFC: nothing to prewarm"); + return; + } + + if (fcs->magic != FILE_CACHE_STATE_MAGIC) + { + elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic); + } + + fcs_size = VARSIZE(fcs); + if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size) + { + elog(ERROR, "LFC: Invalid file cache state size: %u vs. 
%u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs)); + } + + fcs_chunk_size_log = fcs->chunk_size_log; + if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG) + { + elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log); + } + + n_entries = Min(fcs->n_chunks, lfc_prewarm_limit); + Assert(n_entries != 0); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* Do not prewarm more entries than LFC limit */ + if (lfc_ctl->limit <= lfc_ctl->size) + { + elog(LOG, "LFC: skip prewarm because LFC is already filled"); + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->prewarm_active) + { + LWLockRelease(lfc_lock); + elog(ERROR, "LFC: skip prewarm because another prewarm is still active"); + } + lfc_ctl->n_prewarm_entries = n_entries; + lfc_ctl->n_prewarm_workers = n_workers; + lfc_ctl->prewarm_active = true; + lfc_ctl->prewarm_canceled = false; + lfc_ctl->prewarm_batch = prewarm_batch; + memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState)); + + LWLockRelease(lfc_lock); + + /* Calculate total number of pages to be prewarmed */ + lfc_ctl->total_prewarm_pages = fcs->n_pages; + + seg = dsm_create(fcs_size, 0); + memcpy(dsm_segment_address(seg), fcs, fcs_size); + lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg); + + /* Spawn background workers */ + for (uint32 i = 0; i < n_workers; i++) + { + BackgroundWorker worker = {0}; + + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + strcpy(worker.bgw_library_name, "neon"); + strcpy(worker.bgw_function_name, "lfc_prewarm_main"); + snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1); + strcpy(worker.bgw_type, "LFC prewarm worker"); + worker.bgw_main_arg = Int32GetDatum(i); + /* must set notify PID to wait for shutdown */ + worker.bgw_notify_pid = MyProcPid; + + if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i])) + { + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("LFC: registering dynamic bgworker prewarm failed"), + errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes"))); + n_workers = i; + lfc_ctl->prewarm_canceled = true; + break; + } + } + + for (uint32 i = 0; i < n_workers; i++) + { + while (true) + { + PG_TRY(); + { + BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]); + if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED) + { + elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status); + } + break; + } + PG_CATCH(); + { + elog(LOG, "LFC: cancel prewarm"); + lfc_ctl->prewarm_canceled = true; + } + PG_END_TRY(); + } + if (!lfc_ctl->prewarm_workers[i].completed) + { + /* Background worker doesn't set completion time: it means that it was abnormally terminated */ + elog(LOG, "LFC: prewarm worker %d failed", i+1); + /* Set completion time to prevent get_prewarm_info from considering this worker as active */ + lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp(); + } + } + dsm_detach(seg); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_ctl->prewarm_active = false; + LWLockRelease(lfc_lock); +} + +void +lfc_prewarm_main(Datum main_arg) +{ + size_t snd_idx = 0, rcv_idx = 0; + size_t n_sent = 0, n_received = 0; + size_t fcs_chunk_size_log; + size_t max_prefetch_pages; + size_t prewarm_batch; + size_t n_workers; + dsm_segment *seg; + FileCacheState* fcs; + uint8* bitmap; + BufferTag tag; + PrewarmWorkerState* ws; + uint32 worker_id = DatumGetInt32(main_arg); + + pqsignal(SIGTERM, 
die); + BackgroundWorkerUnblockSignals(); + + seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + + fcs = (FileCacheState*) dsm_segment_address(seg); + prewarm_batch = lfc_ctl->prewarm_batch; + fcs_chunk_size_log = fcs->chunk_size_log; + n_workers = lfc_ctl->n_prewarm_workers; + max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log; + ws = &lfc_ctl->prewarm_workers[worker_id]; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + /* enable prefetch in LFC */ + lfc_store_prefetch_result = true; + lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */ + + elog(LOG, "LFC: worker %d start prewarming", worker_id); + while (!lfc_ctl->prewarm_canceled) + { + if (snd_idx < max_prefetch_pages) + { + if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* If there are multiple workers, split chunks between them */ + snd_idx += 1 << fcs_chunk_size_log; + } + else + { + if (BITMAP_ISSET(bitmap, snd_idx)) + { + tag = fcs->chunks[snd_idx >> fcs_chunk_size_log]; + tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1); + if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum)) + { + (void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); + n_sent += 1; + } + else + { + ws->skipped_pages += 1; + BITMAP_CLR(bitmap, snd_idx); + } + } + snd_idx += 1; + } + } + if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages) + { + if (n_received == n_sent && snd_idx == max_prefetch_pages) + { + break; + } + if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* Skip chunks processed by other workers */ + rcv_idx += 1 << fcs_chunk_size_log; + continue; + } + + /* Locate next block to prefetch */ + while (!BITMAP_ISSET(bitmap, rcv_idx)) + { + rcv_idx += 1; + } + tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log]; + tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1); + if (communicator_prefetch_receive(tag)) + { + ws->prewarmed_pages += 1; + } + else + { + ws->skipped_pages += 1; + } + rcv_idx += 1; + n_received += 1; + } + } + /* No need to perform prefetch cleanup here because prewarm worker will be terminated and + * connection to PS dropped just after return from this function. + */ + Assert(n_sent == n_received || lfc_ctl->prewarm_canceled); + elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received); + lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); +} + + /* * Check if page is present in the cache. * Returns true if page is found in local cache. @@ -1001,8 +1372,11 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) * If we can't (e.g. because all other slots are being accessed) * then we will remove this entry from the hash and continue * on to the next chunk, as we may not exceed the limit. + * + * While prewarming LFC we do not want to replace existed entries, + * so we just stop prewarm is LFC cache is full. 
*/ - else if (!dlist_is_empty(&lfc_ctl->lru)) + else if (!dlist_is_empty(&lfc_ctl->lru) && !lfc_do_prewarm) { /* Cache overflow: evict least recently used chunk */ FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, @@ -1026,6 +1400,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) /* Can't add this chunk - we don't have the space for it */ hash_search_with_hash_value(lfc_hash, &entry->key, hash, HASH_REMOVE, NULL); + lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */ return false; } @@ -1112,9 +1487,11 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - + if (lfc_prewarm_update_ws_estimation) + { + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } if (found) { state = GET_STATE(entry, chunk_offs); diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index 849558b83d..c7b6b09f72 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -13,6 +13,17 @@ #include "neon_pgversioncompat.h" +typedef struct FileCacheState +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + uint32 magic; + uint32 n_chunks; + uint32 n_pages; + uint16 chunk_size_log; + BufferTag chunks[FLEXIBLE_ARRAY_MEMBER]; + /* followed by bitmap */ +} FileCacheState; + /* GUCs */ extern bool lfc_store_prefetch_result; @@ -32,7 +43,10 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, extern void lfc_init(void); extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, const void* buffer, XLogRecPtr lsn); +extern FileCacheState* lfc_get_state(size_t max_entries); +extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers); +PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, From c1ff7db1874c6b5ff8ee134b944f1c54616ba37b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Apr 2025 14:54:26 -0400 Subject: [PATCH 007/142] fix(pageserver): consider tombstones in replorigin (#11752) ## Problem We didn't consider tombstones in replorigin read path in the past. This was fine because tombstones are stored as LSN::Invalid before we universally define what the tombstone is for sparse keyspaces. Now we remove non-inherited keys during detach ancestor and write the universal tombstone "empty image". So we need to consider it across all the read paths. related: https://github.com/neondatabase/neon/pull/11299 ## Summary of changes Empty value gets ignored for replorigin scans. --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 16 +++++- pageserver/src/tenant.rs | 52 ++++++++++++++++++- .../src/tenant/timeline/detach_ancestor.rs | 2 +- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 81e548a095..ccb48d8bc1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1084,8 +1084,17 @@ impl Timeline { let mut result = HashMap::new(); for (k, v) in kv { let v = v?; + if v.is_empty() { + // This is a tombstone -- we can skip it. + // Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. 
However, as it part of + // the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone, + // we also need to consider that. Such tombstones might be written on the detach ancestor code path to + // avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.) + continue; + } let origin_id = k.field6 as RepOriginId; - let origin_lsn = Lsn::des(&v).unwrap(); + let origin_lsn = Lsn::des(&v) + .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?; if origin_lsn != Lsn::INVALID { result.insert(origin_id, origin_lsn); } @@ -2578,6 +2587,11 @@ impl DatadirModification<'_> { } } + #[cfg(test)] + pub fn put_for_unit_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dcd043c4a1..e59db74479 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5947,7 +5947,9 @@ mod tests { use itertools::Itertools; #[cfg(feature = "testing")] use models::CompactLsnRange; - use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + use pageserver_api::key::{ + AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key, + }; use pageserver_api::keyspace::KeySpace; #[cfg(feature = "testing")] use pageserver_api::keyspace::KeySpaceRandomAccum; @@ -8183,6 +8185,54 @@ mod tests { assert_eq!(files.get("pg_logical/mappings/test2"), None); } + #[tokio::test] + async fn test_repl_origin_tombstones() { + let harness = TenantHarness::create("test_repl_origin_tombstones") + .await + .unwrap(); + + let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let repl_lsn = Lsn(0x10); + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new())); + modification.set_replorigin(1, repl_lsn).await.unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // we can read everything from the storage + let repl_origins = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); + assert_eq!(repl_origins.len(), 1); + assert_eq!(repl_origins[&1], lsn); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test( + repl_origin_key(3), + Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")), + ); + modification.commit(&ctx).await.unwrap(); + } + let result = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await; + assert!(result.is_err()); + } + #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 8e95c3a8ff..649b33e294 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -178,7 +178,7 @@ impl Attempt { } } -async fn generate_tombstone_image_layer( +pub(crate) async fn generate_tombstone_image_layer( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, From 
9e8ab2ab4f653d1ca96869a1c52d4aec0deb202c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Apr 2025 14:13:35 -0500 Subject: [PATCH 008/142] Skip remote extensions WITH_LIB test when sanitizers are enabled (#11758) In order for the test to work when sanitizers are enabled, we would need to compile the dummy Postgres extension with the same sanitizer flags that we compile Postgres and the neon extension with. Doing this work would be a little more than trivial, so skipping is the best option, at least for now. Signed-off-by: Tristan Partin --- test_runner/regress/test_download_extensions.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 3b6c94a268..d28240c722 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -14,7 +14,7 @@ from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR from fixtures.pg_config import PgConfigKey -from fixtures.utils import subprocess_capture +from fixtures.utils import WITH_SANITIZERS, subprocess_capture from werkzeug.wrappers.response import Response if TYPE_CHECKING: @@ -148,6 +148,15 @@ def test_remote_extensions( pg_config: PgConfig, extension: RemoteExtension, ): + if WITH_SANITIZERS and extension is RemoteExtension.WITH_LIB: + pytest.skip( + """ + For this test to work with sanitizers enabled, we would need to + compile the dummy Postgres extension with the same CFLAGS that we + compile Postgres and the neon extension with to link the sanitizers. + """ + ) + # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" From 3593356c1055f8462011542c561d90b02c84e4eb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 29 Apr 2025 09:44:28 +0300 Subject: [PATCH 009/142] Prewarm sql api (#11742) ## Problem Continue work on prewarm, see https://github.com/neondatabase/neon/pull/11740 https://github.com/neondatabase/neon/pull/11741 ## Summary of changes Add SQL API to prewarm --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/Makefile | 2 + pgxn/neon/file_cache.c | 88 +++++++++++++- pgxn/neon/neon--1.5--1.6.sql | 22 ++++ pgxn/neon/neon--1.6--1.5.sql | 7 ++ test_runner/regress/test_lfc_prewarm.py | 147 ++++++++++++++++++++++++ 5 files changed, 263 insertions(+), 3 deletions(-) create mode 100644 pgxn/neon/neon--1.5--1.6.sql create mode 100644 pgxn/neon/neon--1.6--1.5.sql create mode 100644 test_runner/regress/test_lfc_prewarm.py diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 426b176af9..8bcc6bf924 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -36,6 +36,8 @@ DATA = \ neon--1.2--1.3.sql \ neon--1.3--1.4.sql \ neon--1.4--1.5.sql \ + neon--1.5--1.6.sql \ + neon--1.6--1.5.sql \ neon--1.5--1.4.sql \ neon--1.4--1.3.sql \ neon--1.3--1.2.sql \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 924e0055c1..ecc55bb540 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -793,8 +793,10 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) for (uint32 i = 0; i < n_workers; i++) { - while (true) + bool interrupted; + do { + interrupted = false; PG_TRY(); { BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]); @@ -802,15 +804,16 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) { elog(LOG, "LFC: Unexpected 
status of prewarm worker termination: %d", status); } - break; } PG_CATCH(); { elog(LOG, "LFC: cancel prewarm"); lfc_ctl->prewarm_canceled = true; + interrupted = true; } PG_END_TRY(); - } + } while (interrupted); + if (!lfc_ctl->prewarm_workers[i].completed) { /* Background worker doesn't set completion time: it means that it was abnormally terminated */ @@ -2125,3 +2128,82 @@ approximate_working_set_size(PG_FUNCTION_ARGS) } PG_RETURN_NULL(); } + +PG_FUNCTION_INFO_V1(get_local_cache_state); + +Datum +get_local_cache_state(PG_FUNCTION_ARGS) +{ + size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0); + FileCacheState* fcs = lfc_get_state(max_entries); + if (fcs != NULL) + PG_RETURN_BYTEA_P((bytea*)fcs); + else + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(prewarm_local_cache); + +Datum +prewarm_local_cache(PG_FUNCTION_ARGS) +{ + bytea* state = PG_GETARG_BYTEA_PP(0); + uint32 n_workers = PG_GETARG_INT32(1); + FileCacheState* fcs = (FileCacheState*)state; + + lfc_prewarm(fcs, n_workers); + + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(get_prewarm_info); + +Datum +get_prewarm_info(PG_FUNCTION_ARGS) +{ + Datum values[4]; + bool nulls[4]; + TupleDesc tupdesc; + uint32 prewarmed_pages = 0; + uint32 skipped_pages = 0; + uint32 active_workers = 0; + uint32 total_pages; + size_t n_workers; + + if (lfc_size_limit == 0) + PG_RETURN_NULL(); + + LWLockAcquire(lfc_lock, LW_SHARED); + if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0) + { + LWLockRelease(lfc_lock); + PG_RETURN_NULL(); + } + n_workers = lfc_ctl->n_prewarm_workers; + total_pages = lfc_ctl->total_prewarm_pages; + for (size_t i = 0; i < n_workers; i++) + { + PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i]; + prewarmed_pages += ws->prewarmed_pages; + skipped_pages += ws->skipped_pages; + active_workers += ws->completed != 0; + } + LWLockRelease(lfc_lock); + + tupdesc = CreateTemplateTupleDesc(4); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(total_pages); + values[1] = Int32GetDatum(prewarmed_pages); + values[2] = Int32GetDatum(skipped_pages); + values[3] = Int32GetDatum(active_workers); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + diff --git a/pgxn/neon/neon--1.5--1.6.sql b/pgxn/neon/neon--1.5--1.6.sql new file mode 100644 index 0000000000..c05f0f87aa --- /dev/null +++ b/pgxn/neon/neon--1.5--1.6.sql @@ -0,0 +1,22 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. 
\quit + +CREATE FUNCTION get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer) +RETURNS record +AS 'MODULE_PATHNAME', 'get_prewarm_info' +LANGUAGE C STRICT +PARALLEL SAFE; + +CREATE FUNCTION get_local_cache_state(max_chunks integer default null) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_local_cache_state' +LANGUAGE C +PARALLEL UNSAFE; + +CREATE FUNCTION prewarm_local_cache(state bytea, n_workers integer default 1) +RETURNS void +AS 'MODULE_PATHNAME', 'prewarm_local_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + + + diff --git a/pgxn/neon/neon--1.6--1.5.sql b/pgxn/neon/neon--1.6--1.5.sql new file mode 100644 index 0000000000..57512980f5 --- /dev/null +++ b/pgxn/neon/neon--1.6--1.5.sql @@ -0,0 +1,7 @@ +DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer); + +DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer); + +DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer default 1); + + diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py new file mode 100644 index 0000000000..dd0ae1921d --- /dev/null +++ b/test_runner/regress/test_lfc_prewarm.py @@ -0,0 +1,147 @@ +import random +import threading +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC + + +def check_pinned_entries(cur): + # some LFC buffer can be temporary locked by autovacuum or background writer + for _ in range(10): + cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'") + n_pinned = cur.fetchall()[0][0] + if n_pinned == 0: + break + time.sleep(1) + assert n_pinned == 0 + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm(neon_simple_env: NeonEnv): + env = neon_simple_env + n_records = 1000000 + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.6'") + cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + cur.execute("select get_local_cache_state()") + lfc_state = cur.fetchall()[0][0] + + endpoint.stop() + endpoint.start() + + conn = endpoint.connect() + cur = conn.cursor() + time.sleep(1) # wait until compute_ctl complete downgrade of extension to default version + cur.execute("alter extension neon update to '1.6'") + cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + lfc_used_pages = cur.fetchall()[0][0] + log.info(f"Used LFC size: {lfc_used_pages}") + cur.execute("select * from get_prewarm_info()") + prewarm_info = cur.fetchall()[0] + log.info(f"Prewarm info: {prewarm_info}") + log.info(f"Prewarm progress: {(prewarm_info[1] + prewarm_info[2]) * 100 // prewarm_info[0]}%") + + assert lfc_used_pages > 10000 + assert ( + prewarm_info[0] > 0 + and prewarm_info[1] > 0 + and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] + ) + + cur.execute("select sum(pk) from t") + assert cur.fetchall()[0][0] == n_records * 
(n_records + 1) / 2 + + check_pinned_entries(cur) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): + env = neon_simple_env + n_records = 10000 + n_threads = 4 + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000000", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.6'") + cur.execute( + "create table accounts(id integer primary key, balance bigint default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" + ) + cur.execute(f"insert into accounts(id) values (generate_series(1,{n_records}))") + cur.execute("select get_local_cache_state()") + lfc_state = cur.fetchall()[0][0] + + running = True + + def workload(): + conn = endpoint.connect() + cur = conn.cursor() + n_transfers = 0 + while running: + src = random.randint(1, n_records) + dst = random.randint(1, n_records) + cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) + cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + n_transfers += 1 + log.info(f"Number of transfers: {n_transfers}") + + def prewarm(): + conn = endpoint.connect() + cur = conn.cursor() + n_prewarms = 0 + while running: + cur.execute("alter system set neon.file_cache_size_limit='1MB'") + cur.execute("select pg_reload_conf()") + cur.execute("alter system set neon.file_cache_size_limit='1GB'") + cur.execute("select pg_reload_conf()") + cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + n_prewarms += 1 + log.info(f"Number of prewarms: {n_prewarms}") + + workload_threads = [] + for _ in range(n_threads): + t = threading.Thread(target=workload) + workload_threads.append(t) + t.start() + + prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread.start() + + time.sleep(20) + + running = False + for t in workload_threads: + t.join() + prewarm_thread.join() + + cur.execute("select sum(balance) from accounts") + total_balance = cur.fetchall()[0][0] + assert total_balance == 0 + + check_pinned_entries(cur) From d15f2ff57a3a9aefe0a10ea034d977e83b90504f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 29 Apr 2025 10:56:44 +0200 Subject: [PATCH 010/142] fix(lint-release-pr): adjust lint and action to match (#11766) ## Problem The `lint-release-pr` workflow run for https://github.com/neondatabase/neon/pull/11763 failed, because the new action did not match the lint. ## Summary of changes Include time in expected merge message regex. --- .github/scripts/lint-release-pr.sh | 2 +- .github/workflows/release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh index 6dc5b99f0e..d3badf9562 100755 --- a/.github/scripts/lint-release-pr.sh +++ b/.github/scripts/lint-release-pr.sh @@ -41,7 +41,7 @@ echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" LAST_COMMIT=$(git rev-parse HEAD) MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") -EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$" +EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$" if ! 
[[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then report_error "Merge commit message does not match expected pattern: ' release YYYY-MM-DD' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4b19d6aa3f..0f97cf7c87 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -60,7 +60,7 @@ jobs: git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - name: Create release PR - uses: neondatabase/dev-actions/release-pr@02b41460646b70d12dd33e5f56ebc5af2384c993 + uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677 with: component: ${{ inputs.component }} cherry-pick: ${{ inputs.cherry-pick }} From 7f8b1d79c015ba1158883c689d8ee230426194da Mon Sep 17 00:00:00 2001 From: Busra Kugler Date: Tue, 29 Apr 2025 11:02:01 +0200 Subject: [PATCH 011/142] Replace dorny/paths-filter with step-security maintained version (#11663) ## Problem Our CI/CD security tool StepSecurity maintains safer forks of popular GitHub Actions with low security scores. We're replacing dorny/paths-filter with the maintained step-security/paths-filter version to reduce risk of supply chain breaches and potential CVEs. ## Summary of changes replace ```uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 ``` with ```uses: step-security/paths-filter@v3``` This PR will fix: neondatabase/cloud#26141 --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1791cddacc..6c025ad2a9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: submodules: true - name: Check for file changes - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + uses: step-security/paths-filter@v3 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 79467c8f95..9c504eb5bf 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,7 +53,7 @@ jobs: submodules: true - name: Check for Postgres changes - uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 + uses: step-security/paths-filter@v3 id: files_changed with: token: ${{ github.token }} From 498d852bde2c1b20761f5ca588bcc64c86fce282 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 29 Apr 2025 11:12:14 +0200 Subject: [PATCH 012/142] Fix the empty region if run on schedule (#11764) ## Problem When the workflow ran on a schedule, the `region_id` input was not set. As a result, an empty region value was used, which caused errors during execution. ## Summary of Changes - Added fallback logic to set a default region (`aws-us-east-2`) when `region_id` is not provided. - Ensures the workflow works correctly both when triggered manually (`workflow_dispatch`) and on schedule (`cron`). 
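The fallback leans on GitHub Actions expression semantics: `||` yields its right-hand operand whenever the left side is falsy, and on a `cron` run `inputs.region_id` evaluates to an empty string, which is falsy, so `'aws-us-east-2'` is used. A minimal sketch of the same selection rule, with a made-up `effective_region` helper rather than anything from this repo:

```rust
// Sketch only: the region arrives as a possibly empty string, the way
// `inputs.region_id` does on a scheduled run.
fn effective_region(region_id: &str) -> &str {
    if region_id.is_empty() {
        "aws-us-east-2" // same default as the workflow fallback
    } else {
        region_id
    }
}

fn main() {
    assert_eq!(effective_region(""), "aws-us-east-2"); // cron: no input provided
    assert_eq!(effective_region("aws-eu-central-1"), "aws-eu-central-1"); // manual dispatch
}
```
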
--- .github/workflows/cloud-extensions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml index 7d60469f92..4114f0f9b4 100644 --- a/.github/workflows/cloud-extensions.yml +++ b/.github/workflows/cloud-extensions.yml @@ -68,7 +68,7 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ inputs.region_id }} + region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} # We need these settings to get the expected output results. From b3db7f66ac39f072502b41fd6723e7753a0c37e6 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 29 Apr 2025 14:49:16 +0300 Subject: [PATCH 013/142] fix(compute): Change the local_proxy log level (#11770) Related to the INC-496 --- compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ec24d73242..057099994a 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index b40bdecebc..d048e20b2e 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn From 0b3592921179fee85ab28725a6601213d195ba53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 29 Apr 2025 14:46:15 +0200 Subject: [PATCH 014/142] Make SafekeeperReconciler parallel via semaphore (#11757) Right now we only support running one reconciliation per safekeeper. This is of course usually way below of what a safekeeper can do. Therefore, introduce a semaphore and spawn the tasks asynchronously as they come in. 
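For illustration, the bounding pattern is an owned permit from a shared `tokio::sync::Semaphore` that travels into each spawned task and frees a slot when it is dropped. This is only a minimal sketch assuming the `tokio` crate; `reconcile_one` and the limit of 32 are stand-ins, not the actual reconciler code (that is in the diff below):

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // Cap how many tasks may run at once (32 here is just a placeholder).
    let limiter = Arc::new(Semaphore::new(32));
    let mut handles = Vec::new();

    for i in 0..100u32 {
        // Waits only once all 32 permits are already handed out.
        let permit = limiter.clone().acquire_owned().await.unwrap();
        handles.push(tokio::spawn(async move {
            let _permit = permit; // released when the task finishes
            reconcile_one(i).await;
        }));
    }
    for h in handles {
        let _ = h.await;
    }
}

// Hypothetical stand-in for handling one reconciliation request.
async fn reconcile_one(i: u32) {
    println!("handled request {i}");
}
```
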
Part of #11670 --- storage_controller/src/main.rs | 10 +++- storage_controller/src/service.rs | 4 ++ .../src/service/safekeeper_reconciler.rs | 59 +++++++++++++------ 3 files changed, 55 insertions(+), 18 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 71dde9e126..2eea2f9d10 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -19,7 +19,8 @@ use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -132,6 +133,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper) + #[arg(long)] + safekeeper_reconciler_concurrency: Option, + /// Tenant API rate limit, as requests per second per tenant. #[arg(long, default_value = "10")] tenant_rate_limit: NonZeroU32, @@ -403,6 +408,9 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + safekeeper_reconciler_concurrency: args + .safekeeper_reconciler_concurrency + .unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT), tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, max_split_shards: args.max_split_shards, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0f71a87f13..50ce559cc0 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -194,6 +194,7 @@ pub(crate) enum LeadershipStatus { pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; +pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -382,6 +383,9 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, + /// How many safekeeper reconciles may happen concurrently (per safekeeper) + pub safekeeper_reconciler_concurrency: usize, + /// How many API requests per second to allow per tenant, across all /// tenant-scoped API endpoints. Further API requests queue until ready. 
pub tenant_rate_limit: NonZeroU32, diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index b15772a36c..74308cabff 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -3,7 +3,10 @@ use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; use clashmap::{ClashMap, Entry}; use safekeeper_api::models::PullTimelineRequest; use safekeeper_client::mgmt_api; -use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; +use tokio::sync::{ + Semaphore, + mpsc::{self, UnboundedReceiver, UnboundedSender}, +}; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{ @@ -206,18 +209,27 @@ impl ReconcilerHandle { } pub(crate) struct SafekeeperReconciler { - service: Arc, + inner: SafekeeperReconcilerInner, + concurrency_limiter: Arc, rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, cancel: CancellationToken, } +/// Thin wrapper over `Service` to not clutter its inherent functions +#[derive(Clone)] +struct SafekeeperReconcilerInner { + service: Arc, +} + impl SafekeeperReconciler { fn spawn(cancel: CancellationToken, service: Arc) -> ReconcilerHandle { // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. let (tx, rx) = mpsc::unbounded_channel(); + let concurrency = service.config.safekeeper_reconciler_concurrency; let mut reconciler = SafekeeperReconciler { - service, + inner: SafekeeperReconcilerInner { service }, rx, + concurrency_limiter: Arc::new(Semaphore::new(concurrency)), cancel: cancel.clone(), }; let handle = ReconcilerHandle { @@ -230,31 +242,44 @@ impl SafekeeperReconciler { } async fn run(&mut self) { loop { - // TODO add parallelism with semaphore here let req = tokio::select! { req = self.rx.recv() => req, _ = self.cancel.cancelled() => break, }; let Some((req, req_cancel)) = req else { break }; + + let permit_res = tokio::select! { + req = self.concurrency_limiter.clone().acquire_owned() => req, + _ = self.cancel.cancelled() => break, + }; + let Ok(_permit) = permit_res else { return }; + + let inner = self.inner.clone(); if req_cancel.is_cancelled() { continue; } - let kind = req.kind; - let tenant_id = req.tenant_id; - let timeline_id = req.timeline_id; - let node_id = req.safekeeper.skp.id; - self.reconcile_one(req, req_cancel) - .instrument(tracing::info_span!( - "reconcile_one", - ?kind, - %tenant_id, - ?timeline_id, - %node_id, - )) - .await; + tokio::task::spawn(async move { + let kind = req.kind; + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + let node_id = req.safekeeper.skp.id; + inner + .reconcile_one(req, req_cancel) + .instrument(tracing::info_span!( + "reconcile_one", + ?kind, + %tenant_id, + ?timeline_id, + %node_id, + )) + .await; + }); } } +} + +impl SafekeeperReconcilerInner { async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { let req_host = req.safekeeper.skp.host.clone(); match req.kind { From 09247de8d508be4d65d74d1879ccdfa930ec4794 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 29 Apr 2025 13:11:24 +0000 Subject: [PATCH 015/142] proxy: Enable JSON logging by default (#11772) This does not affect local_proxy. 
--- proxy/README.md | 4 ++-- proxy/src/logging.rs | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index 1156bfd352..583db36f28 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation (see end of this page on how to generate certs with openssl): ``` -./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 +LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 ``` If both postgres and proxy are running you may send a SQL query: @@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key Then we need to build proxy with 'testing' feature and run, e.g.: ```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key ``` Now from client you can start a new session: diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index b83b03bc4f..efa3c0b514 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -132,11 +132,10 @@ impl Drop for LoggingGuard { } } -// TODO: make JSON the default #[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] enum LogFormat { + Text, #[default] - Text = 1, Json, } From 768a580373f1a60fff77a8e385fef040b9c261ef Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 29 Apr 2025 15:07:23 +0100 Subject: [PATCH 016/142] pageserver: add not modified since lsn to get page span (#11774) It's useful when debugging. --- pageserver/src/page_service.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d1a210a786..0ce1a99681 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1035,10 +1035,25 @@ impl PageServerHandler { // avoid a somewhat costly Span::record() by constructing the entire span in one go. macro_rules! 
mkspan { (before shard routing) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since + ) }}; ($shard_id:expr) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, + shard_id = %$shard_id + ) }}; } @@ -1102,6 +1117,7 @@ impl PageServerHandler { shard_id = %shard.get_shard_identity().shard_slug(), timeline_id = %timeline_id, lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, request_id = %req.hdr.reqid, key = %key, ) From a2adc7dbd380fadaa60255eac2c070243a01a7fe Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Apr 2025 16:31:52 +0100 Subject: [PATCH 017/142] storcon: avoid multiple initdbs when shard 0 has stale locations (#11760) ## Problem In #11727 I overlooked the case of multiple attached locations for shard 0. I misread the code and thought `create_one` acts on one location, but it actually acts on one _shard_, which is potentially multiple locations. This was not a regression, but it meant that the fix was incomplete. ## Summary of changes - In `create_one`, when updating shard zero, have any "other" locations use the initdb from shard 0 --- storage_controller/src/service.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 50ce559cc0..72379f0810 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3663,7 +3663,7 @@ impl Service { locations: ShardMutationLocations, http_client: reqwest::Client, jwt: Option, - create_req: TimelineCreateRequest, + mut create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3682,6 +3682,15 @@ impl Service { .await .map_err(|e| passthrough_api_error(&latest, e))?; + // If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use + // the initdb generated by the latest location, rather than generating their own. This avoids racing uploads + // of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries. + if tenant_shard_id.is_shard_zero() { + if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode { + *existing_initdb_timeline_id = Some(create_req.new_timeline_id); + } + } + // We propagate timeline creations to all attached locations such that a compute // for the new timeline is able to start regardless of the current state of the // tenant shard reconciliation. From a08c1a23eb378be1b5b22a19737b0c7f855d30a2 Mon Sep 17 00:00:00 2001 From: Elizabeth Murray <52375559+bizwark@users.noreply.github.com> Date: Tue, 29 Apr 2025 09:50:18 -0700 Subject: [PATCH 018/142] Upgrade the pgrag version in the compute Dockerfile. (#11687) Update the compute Dockerfile to use a new version of pgrag. The new version of pgrag uses the latest pgrx, and has a fix that terminates background workers on postmaster exit. 
--- compute/compute-node.Dockerfile | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index b9299eee90..267940a405 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1084,7 +1084,18 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root ######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1100,11 +1111,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \ + echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . -FROM rust-extensions-build-pgrx12 AS pgrag-build +FROM rust-extensions-build-pgrx14 AS pgrag-build COPY --from=pgrag-src /ext-src/ /ext-src/ # Install build-time dependencies @@ -1124,19 +1135,19 @@ RUN . 
venv/bin/activate && \ WORKDIR /ext-src/pgrag-src RUN cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control RUN cd exts/rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control RUN cd exts/rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ From 1d06172d59caa81e36dce0a17610d80067be4c54 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 19:34:56 +0100 Subject: [PATCH 019/142] pageserver: remove resident size from billing metrics (#11699) This is a rebase of PR #10739 by @henryliu2014 on the current main branch. ## Problem pageserver: remove resident size from billing metrics Fixes #10388 ## Summary of changes The following changes have been made to remove resident size from billing metrics: * removed the metric "resident_size" and related codes in consumption_metrics/metrics.rs * removed the item of the description of metric "resident_size" in consumption_metrics.md * refactored the metric "resident_size" related test case Requested by: John Spray (john@neon.tech) --------- Co-authored-by: liuheqing Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: John Spray --- docs/consumption_metrics.md | 5 --- pageserver/src/consumption_metrics/metrics.rs | 42 ++----------------- .../src/consumption_metrics/metrics/tests.rs | 10 +---- pageserver/src/consumption_metrics/upload.rs | 8 +--- .../test_pageserver_metric_collection.py | 1 - 5 files changed, 7 insertions(+), 59 deletions(-) diff --git a/docs/consumption_metrics.md b/docs/consumption_metrics.md index 6bcd28ab10..eb211af646 100644 --- a/docs/consumption_metrics.md +++ b/docs/consumption_metrics.md @@ -38,11 +38,6 @@ Currently, the following metrics are collected: Amount of WAL produced , by a timeline, i.e. last_record_lsn This is an absolute, per-timeline metric. -- `resident_size` - -Size of all the layer files in the tenant's directory on disk on the pageserver. -This is an absolute, per-tenant metric. - - `remote_storage_size` Size of the remote storage (S3) directory. 
diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 08ab69f349..acdf514101 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -30,9 +30,6 @@ pub(super) enum Name { /// Tenant remote size #[serde(rename = "remote_storage_size")] RemoteSize, - /// Tenant resident size - #[serde(rename = "resident_size")] - ResidentSize, /// Tenant synthetic size #[serde(rename = "synthetic_storage_size")] SyntheticSize, @@ -187,18 +184,6 @@ impl MetricsKey { .absolute_values() } - /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. - /// - /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size - const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: Name::ResidentSize, - } - .absolute_values() - } - /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. /// /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size @@ -261,10 +246,7 @@ where let mut tenants = std::pin::pin!(tenants); while let Some((tenant_id, tenant)) = tenants.next().await { - let mut tenant_resident_size = 0; - let timelines = tenant.list_timelines(); - let timelines_len = timelines.len(); for timeline in timelines { let timeline_id = timeline.timeline_id; @@ -287,16 +269,9 @@ where continue; } } - - tenant_resident_size += timeline.resident_physical_size(); } - if timelines_len == 0 { - // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded. - tenant_resident_size = 1; - } - - let snap = TenantSnapshot::collect(&tenant, tenant_resident_size); + let snap = TenantSnapshot::collect(&tenant); snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); } @@ -305,19 +280,14 @@ where /// In-between abstraction to allow testing metrics without actual Tenants. struct TenantSnapshot { - resident_size: u64, remote_size: u64, synthetic_size: u64, } impl TenantSnapshot { /// Collect tenant status to have metrics created out of it. - /// - /// `resident_size` is calculated of the timelines we had access to for other metrics, so we - /// cannot just list timelines here. 
- fn collect(t: &Arc, resident_size: u64) -> Self { + fn collect(t: &Arc) -> Self { TenantSnapshot { - resident_size, remote_size: t.remote_size(), // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one @@ -334,8 +304,6 @@ impl TenantSnapshot { ) { let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); - let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size); - let synthetic_size = { let factory = MetricsKey::synthetic_size(tenant_id); let mut synthetic_size = self.synthetic_size; @@ -355,11 +323,7 @@ impl TenantSnapshot { } }; - metrics.extend( - [Some(remote_size), Some(resident_size), synthetic_size] - .into_iter() - .flatten(), - ); + metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten()); } } diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 52b4fb8680..5cfb361e40 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -224,7 +224,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -245,7 +244,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), MetricsKey::synthetic_size(tenant_id).at(now, 1000), ] ); @@ -256,7 +254,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -274,7 +271,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), // no synthetic size here ] ); @@ -295,14 +291,13 @@ pub(crate) const fn metric_examples_old( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [RawMetric; 6] { +) -> [RawMetric; 5] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), - MetricsKey::resident_size(tenant_id).at_old_format(now, 0), MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), ] } @@ -312,13 +307,12 @@ pub(crate) const fn metric_examples( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [NewRawMetric; 6] { +) -> [NewRawMetric; 5] { [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), - MetricsKey::resident_size(tenant_id).at(now, 0), MetricsKey::synthetic_size(tenant_id).at(now, 1), ] } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 59e0145a5b..19c5aec5b3 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -521,10 +521,6 @@ mod tests { line!(), 
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, ), - ( - line!(), - r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, - ), ( line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#, @@ -564,7 +560,7 @@ mod tests { assert_eq!(upgraded_samples, new_samples); } - fn metric_samples_old() -> [RawMetric; 6] { + fn metric_samples_old() -> [RawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); @@ -576,7 +572,7 @@ mod tests { super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) } - fn metric_samples() -> [NewRawMetric; 6] { + fn metric_samples() -> [NewRawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index acec0ba44a..ffde08a73f 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -506,7 +506,6 @@ class SyntheticSizeVerifier: PER_METRIC_VERIFIERS = { "remote_storage_size": CannotVerifyAnything, - "resident_size": CannotVerifyAnything, "written_size": WrittenDataVerifier, "written_data_bytes_delta": WrittenDataDeltaVerifier, "timeline_logical_size": CannotVerifyAnything, From b48404952d9658f24f88441b43fb4df7d222b159 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 30 Apr 2025 04:32:25 -0700 Subject: [PATCH 020/142] Bump vm-builder: v0.42.2 -> v0.46.0 (#11782) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumped to pick up the changes from neondatabase/autoscaling#1366 — specifically including `uname` in the logs. 
Other changes included: * neondatabase/autoscaling#1301 * neondatabase/autoscaling#1296 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6c025ad2a9..18bec1b461 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -824,7 +824,7 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.42.2 + VM_BUILDER_VERSION: v0.46.0 steps: - name: Harden the runner (Audit all outbound calls) From 8da4ec9740c8f292170babecfdc07148a60cd9f9 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 30 Apr 2025 13:01:41 +0100 Subject: [PATCH 021/142] Postgres metrics for stuck getpage requests (#11710) https://github.com/neondatabase/neon/issues/10327 Resolves: #11720 New metrics: - `compute_getpage_stuck_requests_total` - `compute_getpage_max_inflight_stuck_time_ms` --- compute/etc/neon_collector.jsonnet | 2 ++ ...pute_getpage_max_inflight_stuck_time_ms.libsonnet | 9 +++++++++ .../compute_getpage_stuck_requests_total.libsonnet | 9 +++++++++ compute/etc/sql_exporter/neon_perf_counters.sql | 2 ++ pgxn/neon/libpagestore.c | 6 ++++++ pgxn/neon/neon_perf_counters.c | 9 ++++++++- pgxn/neon/neon_perf_counters.h | 12 ++++++++++++ 7 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet create mode 100644 compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 449e1199d0..e64d907fe4 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -23,6 +23,8 @@ import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', diff --git a/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet new file mode 100644 index 0000000000..bc1100c832 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_max_inflight_stuck_time_ms', + type: 'gauge', + help: 'Max wait time for stuck requests among all backends. 
Includes only active stuck requests, terminated or disconnected ones are not accounted for', + values: [ + 'compute_getpage_max_inflight_stuck_time_ms', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet new file mode 100644 index 0000000000..5f72f43254 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_stuck_requests_total', + type: 'counter', + help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout', + values: [ + 'compute_getpage_stuck_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 4a36f3bf2f..39a9d03412 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, + compute_getpage_stuck_requests_total numeric, + compute_getpage_max_inflight_stuck_time_ms numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, getpage_prefetches_buffered numeric, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5287c12a84..e758841beb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -877,6 +877,7 @@ retry: int port; int sndbuf; int recvbuf; + uint64* max_wait; get_local_port(PQsocket(pageserver_conn), &port); get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); @@ -887,7 +888,10 @@ retry: shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf, pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; + MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged; shard->receive_logged = true; + max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms; + *max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start)); } /* @@ -910,6 +914,7 @@ retry: get_local_port(PQsocket(pageserver_conn), &port); neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)", INSTR_TIME_GET_DOUBLE(since_start), port); + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; pageserver_disconnect(shard_no); return -1; } @@ -933,6 +938,7 @@ retry: INSTR_TIME_SET_ZERO(shard->receive_start_time); INSTR_TIME_SET_ZERO(shard->receive_last_log_time); shard->receive_logged = false; + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; return ret; } diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index 05db187076..c77d99d636 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram, static metric_t * neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { -#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; @@ -166,6 +166,8 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) 
APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(compute_getpage_stuck_requests_total); + APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms); APPEND_METRIC(getpage_prefetch_misses_total); APPEND_METRIC(getpage_prefetch_discards_total); APPEND_METRIC(pageserver_requests_sent_total); @@ -294,6 +296,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.file_cache_hits_total += counters->file_cache_hits_total; histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); + + totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; + totals.compute_getpage_max_inflight_stuck_time_ms = Max( + totals.compute_getpage_max_inflight_stuck_time_ms, + counters->compute_getpage_max_inflight_stuck_time_ms); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 5f5330bb69..10cf094d4a 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -57,6 +57,18 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; + /* + * Total number of Getpage requests left without an answer for more than + * pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout + */ + uint64 compute_getpage_stuck_requests_total; + + /* + * Longest waiting time for active stuck requests. If a stuck request gets a + * response or disconnects, this metric is updated + */ + uint64 compute_getpage_max_inflight_stuck_time_ms; + /* * Total number of readahead misses; consisting of either prefetches that * don't satisfy the LSN bounds, or cases where no readahead was issued From 60f63c076f9b6b362600d65e01f3f1f8a0f4a5dd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 30 Apr 2025 15:23:20 +0300 Subject: [PATCH 022/142] Make safekeeper proto version 3 default (#11518) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We have been running compute <-> sk protocol version 3 for a while on staging with no issues observed, and want to fully migrate to it eventually. ## Summary of changes Let's make v3 the default. ref https://github.com/neondatabase/neon/issues/10326 --------- Co-authored-by: Arpad Müller --- pgxn/neon/walproposer_pg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index a061639815..17582405db 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,7 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -int safekeeper_proto_version = 2; +int safekeeper_proto_version = 3; /* Set to true in the walproposer bgw. 
*/ static bool am_walproposer; @@ -228,7 +228,7 @@ nwp_register_gucs(void) "Version of compute <-> safekeeper protocol.", "Used while migrating from 2 to 3.", &safekeeper_proto_version, - 2, 0, INT_MAX, + 3, 0, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL, NULL); From 1d68577fbd3c08496dbe7dd716dd5562bc51ca7a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 30 Apr 2025 15:44:59 +0300 Subject: [PATCH 023/142] Check target slot state in prefetch_wait_for (#11779) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1745599814030679 Assume the following scenario: prefetch_wait_for is doing `CHECK_FOR_INTERRUPTS` which tries to load prefetch responses. In case of error is calls pageserver_disconnect which aborts all in-flight requests. But such failure is not detected by `prefetch_wait_for` which returns true. As a result `communicator_read_at_lsnv` assumes that slot is received, but as far as asserts are disables at prod, it is not actually checked. Then it tries to interpret response and ... *SIGSEGV* ## Summary of changes Check target slot state in `prefetch_wait_for`. Resolves https://github.com/neondatabase/cloud/issues/28258 Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 61bb3206e7..818a149499 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -687,8 +687,14 @@ prefetch_wait_for(uint64 ring_index) END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } - - return result; + if (result) + { + /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ + PrefetchRequest *slot = GetPrfSlot(ring_index); + return slot->status == PRFS_RECEIVED; + } + return false; +; } /* From 6b4b8e0d8be55c7224cbf113fb1717179b53f0e9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Apr 2025 11:50:12 -0400 Subject: [PATCH 024/142] fix(pageserver): do not increase basebackup err counter when shutdown (#11778) ## Problem We occasionally see basebackup errors alerts but there were no errors logged. Looking at the code, the only codepath that will cause this is shutting down. ## Summary of changes Do not increase any counter (ok/err) when basebackup request gets cancelled due to shutdowns. Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9a6c3f2378..a68b6acca1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2180,6 +2180,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> { // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, + Err(QueryError::Shutdown) => { + // Do not observe ok/err for shutdown + return; + } Err(QueryError::Disconnected(ConnectionError::Io(io_error))) if is_expected_io_error(io_error) => { From e2db76b9be5fc0de8f953dc4ba9f039ce05cdd95 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Apr 2025 12:04:00 -0400 Subject: [PATCH 025/142] feat(pageserver): ondemand download reason observability (#11780) ## Problem Part of https://github.com/neondatabase/neon/issues/11615 ## Summary of changes We don't understand the root cause of why we get resident size surge every now and then. 
This patch adds observability for that, and in the next week, we might have a better understanding of what's going on. --------- Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 18 ++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a68b6acca1..8e4dbd6c3e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_bytes_total", + "Total bytes of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_count", + "Total count of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod wait_ondemand_download_time { use super::*; const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b7f6e5dc77..50810cb154 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use crate::PERF_TRACE_TARGET; +use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; @@ -1255,6 +1256,14 @@ impl LayerInner { self.access_stats.record_residence_event(); + let task_kind: &'static str = ctx.task_kind().into(); + ONDEMAND_DOWNLOAD_BYTES + .with_label_values(&[task_kind]) + .inc_by(self.desc.file_size); + ONDEMAND_DOWNLOAD_COUNT + .with_label_values(&[task_kind]) + .inc(); + Ok(self.initialize_after_layer_is_on_disk(permit)) } Err(e) => { From bec7427d9e4d84b6bcb6a74338cfb711e1748e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Apr 2025 18:24:01 +0200 Subject: [PATCH 026/142] pull_timeline and sk logging fixes (#11786) This patch contains some fixes of issues I ran into for #11712: * make `pull_timeline` return success for timeline that already exists. This follows general API design of storage components: API endpoints are retryable and converge to a status code, instead of starting to error. We change the `pull_timeline`'s return type a little bit, because we might not actually have a source sk to pull from. Note that the fix is not enough, there is still a race when two `pull_timeline` instances happen in parallel: we might try to enter both pulled timelines at the same time. That can be fixed later. * make `pull_timeline` support one safekeeper being down. In general, if one safekeeper is down, that's not a problem. the added comment explains a potential situation (found in the `test_lagging_sk` test for example) * don't log very long errors when computes try to connect to safekeepers that don't have the timeline yet, if `allow_timeline_creation` is false. That flag is enabled when a sk connection string with generation numbers is passed to the compute, so we'll hit this code path more often. E.g. 
when a safekeeper missed a timeline creation, but the compute connects to it first before the `pull_timeline` gets requested by the storcon reconciler: this is a perfectly normal situation. So don't log the whole error backtrace, and don't log it on the error log level, but only on info. part of #11670 --- libs/postgres_backend/src/lib.rs | 6 ++++ libs/safekeeper_api/src/models.rs | 5 ++-- safekeeper/src/pull_timeline.rs | 28 ++++++++++++++++--- safekeeper/src/receive_wal.rs | 13 ++++++--- .../src/service/safekeeper_reconciler.rs | 9 +++--- 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 654dde8da6..714d8ac403 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -841,6 +841,10 @@ impl PostgresBackend { let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, + // The timeline doesn't exist and we have been requested to not auto-create it. + // Compute requests for timelines that haven't been created yet + // might reach us before the storcon request to create those timelines. + TimelineNoCreate => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd { Terminate, #[error("EOF on COPY stream")] EOF, + #[error("timeline not found, and allow_timeline_creation is false")] + TimelineNoCreate, /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 51f88625da..cc31b38fe7 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -303,7 +303,8 @@ pub struct PullTimelineRequest { #[derive(Debug, Serialize, Deserialize)] pub struct PullTimelineResponse { - // Donor safekeeper host - pub safekeeper_host: String, + /// Donor safekeeper host. + /// None if no pull happened because the timeline already exists. + pub safekeeper_host: Option, // TODO: add more fields? } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 653b084ad8..1510a51019 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -401,7 +401,10 @@ pub async fn handle_request( request.timeline_id, )); if existing_tli.is_ok() { - bail!("Timeline {} already exists", request.timeline_id); + info!("Timeline {} already exists", request.timeline_id); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); } let mut http_client = reqwest::Client::builder(); @@ -425,8 +428,25 @@ pub async fn handle_request( let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let status = response.context(format!("fetching status from {}", http_hosts[i]))?; - statuses.push((status, i)); + match response { + Ok(status) => { + statuses.push((status, i)); + } + Err(e) => { + info!("error fetching status from {}: {e}", http_hosts[i]); + } + } + } + + // Allow missing responses from up to one safekeeper (say due to downtime) + // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes + // offline and C comes online. Then we want a pull on C with A and B as hosts to work. + let min_required_successful = (http_hosts.len() - 1).max(1); + if statuses.len() < min_required_successful { + bail!( + "only got {} successful status responses. 
required: {min_required_successful}", + statuses.len() + ) } // Find the most advanced safekeeper @@ -536,6 +556,6 @@ async fn pull_timeline( .await?; Ok(PullTimelineResponse { - safekeeper_host: host, + safekeeper_host: Some(host), }) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9975153f6c..eb8eee6ab8 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -32,7 +32,7 @@ use crate::metrics::{ WAL_RECEIVERS, }; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; -use crate::timeline::WalResidentTimeline; +use crate::timeline::{TimelineError, WalResidentTimeline}; const DEFAULT_FEEDBACK_CAPACITY: usize = 8; @@ -357,9 +357,14 @@ impl NetworkReader<'_, IO> { .await .context("create timeline")? } else { - self.global_timelines - .get(self.ttid) - .context("get timeline")? + let timeline_res = self.global_timelines.get(self.ttid); + match timeline_res { + Ok(tl) => tl, + Err(TimelineError::NotFound(_)) => { + return Err(CopyStreamHandlerEnd::TimelineNoCreate); + } + other => other.context("get_timeline")?, + } }; tli.wal_residence_guard().await? } diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 74308cabff..71c73a0112 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -306,10 +306,11 @@ impl SafekeeperReconcilerInner { req, async |client| client.pull_timeline(&pull_req).await, |resp| { - tracing::info!( - "pulled timeline from {} onto {req_host}", - resp.safekeeper_host, - ); + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!("timeline already present on safekeeper on {req_host}"); + } }, req_cancel, ) From 1b789e8d7c917898d4069eda8c999f96c5ac0eeb Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 30 Apr 2025 20:50:21 +0400 Subject: [PATCH 027/142] fix(pgxn/neon): Use proper member size in TermsCollectedMset and VotesCollectedMset (#11785) ## Problem `TermsCollectedMset` and `VotesCollectedMset` accept a MemberSet argument to find a quorum in. It may be either `wp->mconf.members` or `wp->mconf.new_members`. But the loops inside always use `wp->mconf.members.len`. If the sizes of member sets are different, it may lead to these functions not scanning all the safekeepers from `mset`. We are not planning to change the member set size dynamically now, but it's worth fixing anyway. 
- Part of https://github.com/neondatabase/neon/issues/11669 ## Summary of changes - Use proper size of member set in `TermsCollectedMset` and `VotesCollectedMset` --- pgxn/neon/walproposer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b95b1451e4..f4f1398375 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -836,7 +836,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_greeted = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; @@ -1106,7 +1106,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_votes = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; From 5bd850d15a3aa1034c580521b15b16ab71d961a0 Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Thu, 1 May 2025 11:09:10 +0200 Subject: [PATCH 028/142] Fix the leaked tracing context for the "compute_monitor:run". (#11791) Removes the leaked tracing context for the "compute_monitor:run" log, which either inherited the "start_compute" span or also the HTTP request context. ## Problem The problem is that the context of the monitor's trace is unnecessarily populated with the span data inherited from previously within the same thread. ## Summary of changes The context is completely reset by moving the span from the thread spawning the monitor into the thread where the monitor will actually start working. Addresses https://github.com/neondatabase/cloud/issues/28145 ## Examples ### Before ``` 2025-04-30T16:39:05.840298Z INFO start_compute:compute_monitor:run: compute is not running, waiting before monitoring activity ``` ### After ``` 2025-04-30T16:39:05.840298Z INFO compute_monitor:run: compute is not running, waiting before monitoring activity ``` --- compute_tools/src/monitor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 5a07eec833..3311ee47b3 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc) -> thread::JoinHandle<()> { experimental, }; - let span = span!(Level::INFO, "compute_monitor"); thread::Builder::new() .name("compute-monitor".into()) .spawn(move || { + let span = span!(Level::INFO, "compute_monitor"); let _enter = span.enter(); monitor.run(); }) From f999632327f108fc497d4c3f467cdf4349f4027a Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Thu, 1 May 2025 11:22:01 -0400 Subject: [PATCH 029/142] Adding `anon` v2 support to the dockerfile (#11313) ## Problem Removed `anon` v1 support as described here: https://github.com/neondatabase/cloud/issues/22663 Adding `anon` v2 support to re-introduce the `pg_anon` extension. 
Related Issues: https://github.com/neondatabase/cloud/issues/20456 ## Summary of changes Adding `anon` v2 support by building it in the dockerfile --- compute/compute-node.Dockerfile | 51 +++++++++++++ compute/patches/anon_v2.patch | 129 ++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 compute/patches/anon_v2.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 267940a405..cc338cec6a 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1096,6 +1096,23 @@ RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +# Version 14 is now required by a few +# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION + +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + ######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1330,6 +1347,39 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_session_jwt-src RUN cargo pgrx install --release +######################################################################################### +# +# Layer "pg-anon-pg-build" +# compile anon extension +# +######################################################################################### +FROM pg-build AS pg_anon-src +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src +COPY compute/patches/anon_v2.patch . + +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + patch -p1 < /ext-src/anon_v2.patch + +FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build +ARG PG_VERSION +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + chmod -R a+r ../pg_anon-src && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; + +######################################################################################## + ######################################################################################### # # Layer "wal2json-build" @@ -1626,6 +1676,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch new file mode 100644 index 0000000000..e833a6dfd3 --- /dev/null +++ b/compute/patches/anon_v2.patch @@ -0,0 +1,129 @@ +diff --git a/sql/anon.sql b/sql/anon.sql +index 0cdc769..f6cc950 100644 +--- a/sql/anon.sql ++++ b/sql/anon.sql +@@ -1141,3 +1141,8 @@ $$ + -- TODO : https://en.wikipedia.org/wiki/L-diversity + + -- TODO : https://en.wikipedia.org/wiki/T-closeness ++ ++-- NEON Patches ++ ++GRANT ALL ON SCHEMA anon to neon_superuser; ++GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; +diff --git a/sql/init.sql b/sql/init.sql +index 7da6553..9b6164b 100644 +--- a/sql/init.sql ++++ b/sql/init.sql +@@ -74,50 +74,49 @@ $$ + + SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED'; + +--- load fake data from a given path +-CREATE OR REPLACE FUNCTION anon.init( +- datapath TEXT +-) ++CREATE OR REPLACE FUNCTION anon.load_fake_data() + RETURNS BOOLEAN + AS $$ + DECLARE +- datapath_check TEXT; + success BOOLEAN; ++ sharedir TEXT; ++ datapath TEXT; + BEGIN + +- IF anon.is_initialized() THEN +- RAISE NOTICE 'The anon extension is already initialized.'; +- RETURN TRUE; +- END IF; ++ datapath := '/extension/anon/'; ++ -- find the local extension directory ++ SELECT setting INTO sharedir ++ FROM pg_catalog.pg_config ++ WHERE name = 'SHAREDIR'; + + SELECT bool_or(results) INTO success + FROM unnest(array[ +- anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'), +- anon.load_csv('anon.identifier',datapath ||'/identifier.csv'), +- anon.load_csv('anon.address',datapath ||'/address.csv'), +- anon.load_csv('anon.city',datapath ||'/city.csv'), +- anon.load_csv('anon.company',datapath ||'/company.csv'), +- anon.load_csv('anon.country',datapath ||'/country.csv'), +- anon.load_csv('anon.email', datapath ||'/email.csv'), +- anon.load_csv('anon.first_name',datapath ||'/first_name.csv'), +- anon.load_csv('anon.iban',datapath ||'/iban.csv'), +- anon.load_csv('anon.last_name',datapath 
||'/last_name.csv'), +- anon.load_csv('anon.postcode',datapath ||'/postcode.csv'), +- anon.load_csv('anon.siret',datapath ||'/siret.csv'), +- anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv') ++ anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'), ++ anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'), ++ anon.load_csv('anon.address',sharedir || datapath || '/address.csv'), ++ anon.load_csv('anon.city',sharedir || datapath || '/city.csv'), ++ anon.load_csv('anon.company',sharedir || datapath || '/company.csv'), ++ anon.load_csv('anon.country',sharedir || datapath || '/country.csv'), ++ anon.load_csv('anon.email', sharedir || datapath || '/email.csv'), ++ anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'), ++ anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'), ++ anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'), ++ anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'), ++ anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'), ++ anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv') + ]) results; + RETURN success; +- + END; + $$ +- LANGUAGE PLPGSQL ++ LANGUAGE plpgsql + VOLATILE + RETURNS NULL ON NULL INPUT +- PARALLEL UNSAFE -- because load_csv is unsafe +- SECURITY INVOKER ++ PARALLEL UNSAFE -- because of the EXCEPTION ++ SECURITY DEFINER + SET search_path='' + ; +-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED'; ++ ++SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED'; + + -- People tend to forget the anon.init() step + -- This is a friendly notice for them +@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.load(TEXT) + RETURNS BOOLEAN AS + $$ +- SELECT anon.init($1); ++ SELECT anon.init(); + $$ + LANGUAGE SQL + VOLATILE +@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.init() + RETURNS BOOLEAN + AS $$ +- WITH conf AS ( +- -- find the local extension directory +- SELECT setting AS sharedir +- FROM pg_catalog.pg_config +- WHERE name = 'SHAREDIR' +- ) +- SELECT anon.init(conf.sharedir || '/extension/anon/') +- FROM conf; ++BEGIN ++ IF anon.is_initialized() THEN ++ RAISE NOTICE 'The anon extension is already initialized.'; ++ RETURN TRUE; ++ END IF; ++ ++ RETURN anon.load_fake_data(); ++END; + $$ +- LANGUAGE SQL ++ LANGUAGE plpgsql + VOLATILE + PARALLEL UNSAFE -- because init is unsafe + SECURITY INVOKER From 16d594b7b37733fd56d3cc4767b119f1dd391fb7 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 1 May 2025 16:56:43 +0100 Subject: [PATCH 030/142] pagectl: list layers for given key in decreasing LSN order (#11799) Adds an extra key CLI arg to `pagectl layer list-layer`. When provided, only layers with key ranges containing the key will be listed in decreasing LSN order (indices are preserved for `dump-layer`). 
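
As a rough illustration of the new flag (the data-directory path, tenant/timeline IDs and the key value below are placeholders, not taken from this change; the argument order follows the clap definition added in this patch):

```
# List every layer of the timeline, in index order (previous behaviour).
pagectl layer list-layer /var/lib/pageserver <tenant_id> <timeline_id>

# List only the layers whose key range contains <key>, ordered by decreasing
# lsn_range.end; the printed [idx] values still match what dump-layer expects.
pagectl layer list-layer /var/lib/pageserver <tenant_id> <timeline_id> <key>
```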
--- pageserver/ctl/src/layers.rs | 37 +++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 293c01eff0..79f56a5a51 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -27,6 +28,7 @@ pub(crate) enum LayerCmd { path: PathBuf, tenant: String, timeline: String, + key: Option, }, /// Dump all information of a layer file DumpLayer { @@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { path, tenant, timeline, + key, } => { let timeline_path = path .join(TENANTS_SEGMENT_NAME) @@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; + let mut to_print = Vec::default(); for layer in fs::read_dir(timeline_path)? { let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { - println!( - "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", - idx, - layer_file.key_range.start, - layer_file.key_range.end, - layer_file.lsn_range.start, - layer_file.lsn_range.end, - layer_file.is_delta, - ); + if let Some(key) = key { + if layer_file.key_range.start <= *key && *key < layer_file.key_range.end { + to_print.push((idx, layer_file)); + } + } else { + to_print.push((idx, layer_file)); + } idx += 1; } } + + if key.is_some() { + to_print + .sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end)); + } + + for (idx, layer_file) in to_print { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + } Ok(()) } LayerCmd::DumpLayer { From ae2c3ac12ff1b21f92a0d81b8988ff255c3dd8cf Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 1 May 2025 13:51:10 -0400 Subject: [PATCH 031/142] test: revert relsizev2 config (#11759) ## Problem part of https://github.com/neondatabase/neon/issues/9516 One thing I realized in the past few months is that "no-way-back" things like this are scary to roll out without a fine-grained rollout infra. The plan was to flip the flag in the repo and roll it out soon, but I don't think rolling out would happen in the near future. So I'd rather revert the flag to avoid creating a discrepancy between staging and the regress tests. ## Summary of changes Not using rel_size_v2 by default in unit tests; we still have a few tests to explicitly test the new format so we still get some test coverages. 
--------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 3 ++- test_runner/regress/test_attach_tenant_config.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b93df4ede4..47d1228c61 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1279,7 +1279,8 @@ class NeonEnv: ) tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + # This feature is pending rollout. + # tenant_config["rel_size_v2_enabled"] = True if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ee408e3c65..3616467c00 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -186,7 +186,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, - "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it + "rel_size_v2_enabled": True, "gc_compaction_enabled": True, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, From bbc35e10b877f7bbb260595c90375f4e5a3bfbdb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 1 May 2025 14:36:26 -0400 Subject: [PATCH 032/142] fix(test): increase timeouts for some tests (#11781) ## Problem Those tests are timing out more frequently after https://github.com/neondatabase/neon/pull/11585 ## Summary of changes Increase timeout for `test_pageserver_gc_compaction_smoke` Increase rollback wait timeout for `test_tx_abort_with_many_relations` Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 2 +- test_runner/regress/test_pg_regress.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 53edf9f79e..0dfc665a1d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -229,7 +229,7 @@ def test_pageserver_gc_compaction_preempt( @skip_in_debug_build("only run with release build") -@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 0fea706888..474002353b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -471,7 +471,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=15) + fut.result(timeout=15 if reldir_type == "v1" else 30) except: exec.shutdown(wait=False, cancel_futures=True) raise From 22290eb7ba14b1bd6efd0f6999f6b292684207a0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 2 May 2025 13:46:21 +0100 Subject: [PATCH 033/142] CI: notify relevant team about release deploy failures (#11797) 
## Problem We notify only Storage team about failed deploys, but Compute and Proxy teams can also benefit from that ## Summary of changes - Adjust `notify-storage-release-deploy-failure` to notify the relevant team about failed deploy --- .github/actionlint.yml | 5 ++++ .github/workflows/build_and_test.yml | 37 +++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1d1b50e458..b7e0be761a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -33,9 +33,14 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID + - SLACK_COMPUTE_CHANNEL_ID - SLACK_ON_CALL_DEVPROD_STREAM - SLACK_ON_CALL_QA_STAGING_STREAM - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_ONCALL_COMPUTE_GROUP + - SLACK_ONCALL_PROXY_GROUP + - SLACK_ONCALL_STORAGE_GROUP + - SLACK_PROXY_CHANNEL_ID - SLACK_RUST_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 18bec1b461..e0995218f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1434,10 +1434,10 @@ jobs: ;; esac - notify-storage-release-deploy-failure: - needs: [ deploy ] + notify-release-deploy-failure: + needs: [ meta, deploy ] # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. - if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always() runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) @@ -1445,15 +1445,40 @@ jobs: with: egress-policy: audit - - name: Post release-deploy failure to team-storage slack channel + - name: Post release-deploy failure to team slack channel uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + env: + TEAM_ONCALL: >- + ${{ + fromJSON(format('{ + "storage-release": "", + "compute-release": "", + "proxy-release": "" + }', + vars.SLACK_ONCALL_STORAGE_GROUP, + vars.SLACK_ONCALL_COMPUTE_GROUP, + vars.SLACK_ONCALL_PROXY_GROUP + ))[needs.meta.outputs.run-kind] + }} + CHANNEL: >- + ${{ + fromJSON(format('{ + "storage-release": "{0}", + "compute-release": "{1}", + "proxy-release": "{2}" + }', + vars.SLACK_STORAGE_CHANNEL_ID, + vars.SLACK_COMPUTE_CHANNEL_ID, + vars.SLACK_PROXY_CHANNEL_ID + ))[needs.meta.outputs.run-kind] + }} with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | - channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + channel: ${{ env.CHANNEL }} text: | - 🔴 : deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + 🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. 
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: From 79699aebc8ac886afea2ec45320d7129232007cb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 May 2025 17:36:10 +0300 Subject: [PATCH 034/142] Reserve in file descriptor pool sockets used for connections to page servers (#11798) ## Problem See https://github.com/neondatabase/neon/issues/11790 The neon extension opens extensions to the pageservers, which consumes file descriptors. Postgres has a mechanism to count how many FDs are in use, but it doesn't know about those FDs. We should call ReserveExternalFD() or AcquireExternalFD() to account for them. ## Summary of changes Call `ReserveExternalFD()` for each shard --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Mikhail Kot --- pgxn/neon/libpagestore.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index e758841beb..ee4e6ccc5b 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -26,6 +26,7 @@ #include "portability/instr_time.h" #include "postmaster/interrupt.h" #include "storage/buf_internals.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" @@ -79,6 +80,7 @@ int neon_protocol_version = 3; static int neon_compute_mode = 0; static int max_reconnect_attempts = 60; static int stripe_size; +static int max_sockets; static int pageserver_response_log_timeout = 10000; /* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */ @@ -336,6 +338,13 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) pageserver_disconnect(i); } pagestore_local_counter = end_update_counter; + + /* Reserve file descriptors for sockets */ + while (max_sockets < num_shards) + { + max_sockets += 1; + ReserveExternalFD(); + } } if (num_shards_p) From 4b9087651c2ef0d7888d76e8786066d00381388f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 May 2025 22:27:59 +0300 Subject: [PATCH 035/142] Checked that stored LwLSN >= FirstNormalUnloggedLSN (#11750) ## Problem Undo unintended change 60b9fb1baf4cba732cff4792b9a97d755794b7e2 ## Summary of changes Add assert that we are not storing fake LSN in LwLSN. 
--------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon_lwlsncache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index 6959da55cb..a8cfa0f825 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -4,6 +4,7 @@ #include "miscadmin.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "storage/ipc.h" #include "storage/shmem.h" #include "storage/buf_internals.h" @@ -396,9 +397,10 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, XLogRecPtr neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn < FirstNormalUnloggedLSN || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) return lsn; + Assert(lsn >= WalSegMinSize); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks); LWLockRelease(LastWrittenLsnLock); @@ -435,7 +437,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, NInfoGetRelNumber(relfilenode) == InvalidOid) return InvalidXLogRecPtr; - BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -444,6 +445,10 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, { XLogRecPtr lsn = lsns[i]; + if (lsn == InvalidXLogRecPtr) + continue; + + Assert(lsn >= WalSegMinSize); key.blockNum = blockno + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) From 6131d86ec972f08febecc7b29a4273e84a408905 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 5 May 2025 12:18:55 +0100 Subject: [PATCH 036/142] proxy: allow invalid SNI (#11792) ## Problem Some PrivateLink customers are unable to use Private DNS. As such they use an invalid domain name to address Neon. We currently are rejecting those connections because we cannot resolve the correct certificate. ## Summary of changes 1. Ensure a certificate is always returned. 2. If there is an SNI field, use endpoint fallback if it doesn't match. I suggest reviewing each commit separately. 
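
For illustration, a hedged sketch of the client-side flow this unblocks — the hostname and endpoint ID are placeholders, and routing via `options=endpoint%3D...` is the pre-existing fallback exercised by the new `parse_unknown_sni_with_options` test, not something introduced here:

```
# The PrivateLink DNS name does not match any proxy certificate, so the proxy
# now answers with its default certificate instead of failing the handshake.
# The endpoint is carried in the startup options instead of SNI, and TLS stays
# at sslmode=require (stricter modes would reject the fallback certificate, as
# noted in the resolver comment below).
psql "postgresql://alice:secret@pg.internal.example.com/neondb?sslmode=require&options=endpoint%3Dep-foo-bar-123456"
```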
--- proxy/src/auth/credentials.rs | 65 ++++----- proxy/src/proxy/handshake.rs | 15 +-- proxy/src/proxy/tests/mod.rs | 3 +- proxy/src/serverless/sql_over_http.rs | 3 +- proxy/src/tls/server_config.rs | 187 ++++++++++++++------------ 5 files changed, 138 insertions(+), 135 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c55af325e3..183976374a 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -32,12 +32,6 @@ pub(crate) enum ComputeUserInfoParseError { option: EndpointId, }, - #[error( - "Common name inferred from SNI ('{}') is not known", - .cn, - )] - UnknownCommonName { cn: String }, - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(EndpointId), } @@ -66,22 +60,15 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub(crate) fn endpoint_sni( - sni: &str, - common_names: &HashSet, -) -> Result, ComputeUserInfoParseError> { - let Some((subdomain, common_name)) = sni.split_once('.') else { - return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); - }; +pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option { + let (subdomain, common_name) = sni.split_once('.')?; if !common_names.contains(common_name) { - return Err(ComputeUserInfoParseError::UnknownCommonName { - cn: common_name.into(), - }); + return None; } if subdomain == SERVERLESS_DRIVER_SNI { - return Ok(None); + return None; } - Ok(Some(EndpointId::from(subdomain))) + Some(EndpointId::from(subdomain)) } impl ComputeUserInfoMaybeEndpoint { @@ -113,15 +100,8 @@ impl ComputeUserInfoMaybeEndpoint { }) .map(|name| name.into()); - let endpoint_from_domain = if let Some(sni_str) = sni { - if let Some(cn) = common_names { - endpoint_sni(sni_str, cn)? - } else { - None - } - } else { - None - }; + let endpoint_from_domain = + sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn))); let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. 
@@ -424,21 +404,34 @@ mod tests { } #[test] - fn parse_inconsistent_sni() { + fn parse_unknown_sni() { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); let ctx = RequestContext::test(); - let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); - match err { - UnknownCommonName { cn } => { - assert_eq!(cn, "localhost"); - } - _ => panic!("bad error: {err:?}"), - } + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert!(info.endpoint_id.is_none()); + } + + #[test] + fn parse_unknown_sni_with_options() { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "endpoint=foo-bar-baz-1234"), + ]); + + let sni = Some("project.localhost"); + let common_names = Some(["example.com".into()].into()); + + let ctx = RequestContext::test(); + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234")); } #[test] diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index c05031ad97..54c02f2c15 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -24,9 +24,6 @@ pub(crate) enum HandshakeError { #[error("protocol violation")] ProtocolViolation, - #[error("missing certificate")] - MissingCertificate, - #[error("{0}")] StreamUpgradeError(#[from] StreamUpgradeError), @@ -42,10 +39,6 @@ impl ReportableError for HandshakeError { match self { HandshakeError::EarlyData => crate::error::ErrorKind::User, HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, - // This error should not happen, but will if we have no default certificate and - // the client sends no SNI extension. - // If they provide SNI then we can be sure there is a certificate that matches. 
- HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, HandshakeError::StreamUpgradeError(upgrade) => match upgrade { StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, @@ -146,7 +139,7 @@ pub(crate) async fn handshake( // try parse endpoint let ep = conn_info .server_name() - .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + .and_then(|sni| endpoint_sni(sni, &tls.common_names)); if let Some(ep) = ep { ctx.set_endpoint_id(ep); } @@ -161,10 +154,8 @@ pub(crate) async fn handshake( } } - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(conn_info.server_name()) - .ok_or(HandshakeError::MissingCertificate)?; + let (_, tls_server_end_point) = + tls.cert_resolver.resolve(conn_info.server_name()); stream = PqStream { framed: Framed { diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 9a6864c33e..f47636cd71 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -98,8 +98,7 @@ fn generate_tls_config<'a>( .with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())?; - let mut cert_resolver = CertResolver::new(); - cert_resolver.add_cert(key, vec![cert], true)?; + let cert_resolver = CertResolver::new(key, vec![cert])?; let common_names = cert_resolver.get_common_names(); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7fb39553f9..fee5942b7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -199,8 +199,7 @@ fn get_conn_info( let endpoint = match connection_url.host() { Some(url::Host::Domain(hostname)) => { if let Some(tls) = tls { - endpoint_sni(hostname, &tls.common_names)? - .ok_or(ConnInfoError::MalformedEndpoint)? + endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)? 
} else { hostname .split_once('.') diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 5a95e69fde..8f8917ef62 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -5,6 +5,7 @@ use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::sign::CertifiedKey; use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; @@ -25,10 +26,8 @@ pub fn configure_tls( certs_dir: Option<&String>, allow_tls_keylogfile: bool, ) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; + let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?; // add extra certificates if let Some(certs_dir) = certs_dir { @@ -40,11 +39,8 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; + cert_resolver + .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; } } } @@ -83,92 +79,42 @@ pub fn configure_tls( }) } -#[derive(Default, Debug)] +#[derive(Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, + default: (Arc, TlsServerEndPoint), } impl CertResolver { - pub fn new() -> Self { - Self::default() + fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + Self::new(priv_key, cert_chain) } - fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; - rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - }; + pub fn new( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + ) -> anyhow::Result { + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? 
- }; - - self.add_cert(priv_key, cert_chain, is_default) + let mut certs = HashMap::new(); + let default = (cert.clone(), tls_server_end_point); + certs.insert(common_name, (cert, tls_server_end_point)); + Ok(Self { certs, default }) } - pub fn add_cert( + fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + self.add_cert(priv_key, cert_chain) + } + + fn add_cert( &mut self, priv_key: PrivateKeyDer<'static>, cert_chain: Vec>, - is_default: bool, ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let certificate = SliceReader::new(first_cert) - .context("Failed to parse cerficiate")? - .decode::() - .context("Failed to parse cerficiate")?; - - let common_name = certificate.tbs_certificate.subject.to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. - // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; self.certs.insert(common_name, (cert, tls_server_end_point)); - Ok(()) } @@ -177,12 +123,82 @@ impl CertResolver { } } +fn parse_key_cert( + key_path: &str, + cert_path: &str, +) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ) + })? 
+ }; + + Ok((priv_key, cert_chain)) +} + +fn process_key_cert( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, +) -> anyhow::Result<(String, Arc, TlsServerEndPoint)> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let certificate = SliceReader::new(first_cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + let common_name = certificate.tbs_certificate.subject.to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. + // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + Ok((common_name, cert, tls_server_end_point)) +} + impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, client_hello: rustls::server::ClientHello<'_>, ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) + Some(self.resolve(client_hello.server_name()).0) } } @@ -190,7 +206,7 @@ impl CertResolver { pub fn resolve( &self, server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { + ) -> (Arc, TlsServerEndPoint) { // loop here and cut off more and more subdomains until we find // a match to get a proper wildcard support. OTOH, we now do not // use nested domains, so keep this simple for now. @@ -200,12 +216,17 @@ impl CertResolver { if let Some(mut sni_name) = server_name { loop { if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); + return cert.clone(); } if let Some((_, rest)) = sni_name.split_once('.') { sni_name = rest; } else { - return None; + // The customer has some custom DNS mapping - just return + // a default certificate. + // + // This will error if the customer uses anything stronger + // than sslmode=require. That's a choice they can make. + return self.default.clone(); } } } else { From 0b243242df493cf3a60de6d0ec1a0d1491c3cd4c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 5 May 2025 20:15:22 +0800 Subject: [PATCH 037/142] fix(test): allow flush error in gc-compaction tests (#11822) ## Problem Part of https://github.com/neondatabase/neon/issues/11762 ## Summary of changes While #11762 needs some work to refactor the error propagating thing, we can do a hacky fix for the gc-compaction tests to allow flush error during shutdown. It does not affect correctness. 
Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 0dfc665a1d..370f57b19d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -202,6 +202,8 @@ def test_pageserver_gc_compaction_preempt( env = neon_env_builder.init_start(initial_tenant_conf=conf) env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + env.pageserver.allowed_errors.append(".*flush task cancelled.*") + env.pageserver.allowed_errors.append(".*failed to pipe.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline From baf425a2cde7a92873b548115847445f3a43f6b0 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 5 May 2025 15:06:37 +0200 Subject: [PATCH 038/142] [pageserver/virtual_file] impr: Improve OpenOptions API ergonomics (#11789) # Improve OpenOptions API ergonomics Closes #11787 This PR improves the OpenOptions API ergonomics by: 1. Making OpenOptions methods take and return owned Self instead of &mut self 2. Changing VirtualFile::open_with_options_v2 to take an owned OpenOptions 3. Removing unnecessary .clone() and .to_owned() calls These changes make the API more idiomatic Rust by leveraging the builder pattern with owned values, which is cleaner and more ergonomic than the previous approach. Link to Devin run: https://app.devin.ai/sessions/c2a4b24f7aca40a3b3777f4259bf8ee1 Requested by: christian@neon.tech --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: christian@neon.tech --- pageserver/src/virtual_file.rs | 42 ++++++++------------- pageserver/src/virtual_file/open_options.rs | 17 ++++----- 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 58953407b1..f429e59ef3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -14,8 +14,6 @@ use std::fs::File; use std::io::{Error, ErrorKind}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; use std::sync::LazyLock; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; @@ -99,7 +97,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, - open_options: &OpenOptions, + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); @@ -112,21 +110,16 @@ impl VirtualFile { #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - let open_options = open_options.clone(); - let open_options = if set_o_direct { + if set_o_direct { #[cfg(target_os = "linux")] { - let mut open_options = open_options; - open_options.custom_flags(nix::libc::O_DIRECT); - open_options + open_options = open_options.custom_flags(nix::libc::O_DIRECT); } #[cfg(not(target_os = "linux"))] unreachable!( "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" ); - } else { - open_options - }; + } let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -530,7 +523,7 @@ impl VirtualFileInner { path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path.as_ref(), 
OpenOptions::new().read(true).clone(), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Open a file with given options. @@ -558,10 +551,11 @@ impl VirtualFileInner { // It would perhaps be nicer to check just for the read and write flags // explicitly, but OpenOptions doesn't contain any functions to read flags, // only to set them. - let mut reopen_options = open_options.clone(); - reopen_options.create(false); - reopen_options.create_new(false); - reopen_options.truncate(false); + let reopen_options = open_options + .clone() + .create(false) + .create_new(false) + .truncate(false); let vfile = VirtualFileInner { handle: RwLock::new(handle), @@ -1307,7 +1301,7 @@ mod tests { opts: OpenOptions, ctx: &RequestContext, ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, &opts, ctx).await?; + let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?; Ok(MaybeVirtualFile::VirtualFile(vf)) } } @@ -1374,7 +1368,7 @@ mod tests { let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?; // cannot write to a file opened in read-only mode let _ = file_a @@ -1393,8 +1387,7 @@ mod tests { .read(true) .write(true) .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; @@ -1412,12 +1405,7 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = A::open( - path_b.clone(), - OpenOptions::new().read(true).to_owned(), - &ctx, - ) - .await?; + let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?; assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); vfiles.push(vfile); } @@ -1466,7 +1454,7 @@ mod tests { for _ in 0..VIRTUAL_FILES { let f = VirtualFileInner::open_with_options( &test_file_path, - OpenOptions::new().read(true).clone(), + OpenOptions::new().read(true), &ctx, ) .await?; diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7d323f3d8f..2a7bb693f2 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,6 +1,7 @@ //! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; use std::os::fd::OwnedFd; +use std::os::unix::fs::OpenOptionsExt; use std::path::Path; use super::io_engine::IoEngine; @@ -43,7 +44,7 @@ impl OpenOptions { self.write } - pub fn read(&mut self, read: bool) -> &mut OpenOptions { + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.read(read); @@ -56,7 +57,7 @@ impl OpenOptions { self } - pub fn write(&mut self, write: bool) -> &mut OpenOptions { + pub fn write(mut self, write: bool) -> Self { self.write = write; match &mut self.inner { Inner::StdFs(x) => { @@ -70,7 +71,7 @@ impl OpenOptions { self } - pub fn create(&mut self, create: bool) -> &mut OpenOptions { + pub fn create(mut self, create: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create(create); @@ -83,7 +84,7 @@ impl OpenOptions { self } - pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + pub fn create_new(mut self, create_new: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create_new(create_new); @@ -96,7 +97,7 @@ impl OpenOptions { self } - pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + pub fn truncate(mut self, truncate: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.truncate(truncate); @@ -124,10 +125,8 @@ impl OpenOptions { } } } -} -impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { - fn mode(&mut self, mode: u32) -> &mut OpenOptions { + pub fn mode(mut self, mode: u32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.mode(mode); @@ -140,7 +139,7 @@ impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { self } - fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + pub fn custom_flags(mut self, flags: i32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.custom_flags(flags); From cb67f9a6517652d21917a77f61b3e466f992d901 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 5 May 2025 16:30:13 +0200 Subject: [PATCH 039/142] delete orphan left over projects (#11826) ## Problem sometimes our benchmarking GitHub workflow is terminated by side-effects beyond our control (e.g. 
GitHub runner looses connection to server) and then we have left-over Neon projects created during the workflow [Example where GitHub runner lost connection and project was not deleted](https://github.com/neondatabase/neon/actions/runs/14017400543/job/39244816485) Fixes https://github.com/neondatabase/cloud/issues/28546 ## Summary of changes - Add a cleanup step that cleans up left-over projects - also give each project created during workflows a name that references the testcase and GitHub runid ## Example run (test of new job steps) https://github.com/neondatabase/neon/actions/runs/14837092399/job/41650741922#step:6:63 --------- Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com> --- .github/workflows/benchmarking.yml | 71 +++++++++++++++++++ .../test_cumulative_statistics_persistence.py | 6 +- .../performance/test_physical_replication.py | 8 ++- 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5107f457e2..220d7905b1 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -53,6 +53,77 @@ concurrency: cancel-in-progress: true jobs: + cleanup: + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + env: + ORG_ID: org-solitary-dew-09443886 + LIMIT: 100 + SEARCH: "Created by actions/neon-project-create; GITHUB_RUN_ID" + BASE_URL: https://console-stage.neon.build/api/v2 + DRY_RUN: "false" # Set to "true" to just test out the workflow + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Cleanup inactive Neon projects left over from prior runs + env: + API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + run: | + set -euo pipefail + + NOW=$(date -u +%s) + DAYS_AGO=$((NOW - 5 * 86400)) + + REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID" + + echo "Requesting project list from:" + echo "$REQUEST_URL" + + response=$(curl -s -X GET "$REQUEST_URL" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" ) + + echo "Response:" + echo "$response" | jq . + + projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" ' + .projects[] + | select(.compute_last_active_at != null) + | select((.compute_last_active_at | fromdateiso8601) < $cutoff) + | {id, name, compute_last_active_at} + ') + + if [ -z "$projects_to_delete" ]; then + echo "No projects eligible for deletion." + exit 0 + fi + + echo "Projects that will be deleted:" + echo "$projects_to_delete" | jq -r '.id' + + if [ "$DRY_RUN" = "false" ]; then + echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do + echo "Deleting project: $project_id" + curl -s -X DELETE "$BASE_URL/projects/$project_id" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + done + else + echo "Dry run enabled — no projects were deleted." 
+ fi bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py index 061467bbad..5e9e55cb0f 100644 --- a/test_runner/performance/test_cumulative_statistics_persistence.py +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -1,4 +1,5 @@ import math # Add this import +import os import time import traceback from pathlib import Path @@ -87,7 +88,10 @@ def test_cumulative_statistics_persistence( - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension """ - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, + f"Test cumulative statistics persistence, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}", + ) project_id = project["project"]["id"] neon_api.wait_for_operation_to_finish(project_id) endpoint_id = project["endpoints"][0]["id"] diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index bdafa2d657..c580bfcc14 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -62,7 +62,9 @@ def test_ro_replica_lag( pgbench_duration = f"-T{test_duration_min * 60 * 2}" - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test readonly replica lag, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) @@ -195,7 +197,9 @@ def test_replication_start_stop( pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}" error_occurred = False - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test replication start stop, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) From 16ca74a3f41789a1136e1256a8b9e5ec59b33417 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 May 2025 09:49:23 +0300 Subject: [PATCH 040/142] Add SAFETY comment on libc::sysconf() call (#11581) I got an 'undocumented_unsafe_blocks' clippy warning about it. Not sure why I got the warning now and not before, but in any case a comment is a good idea. --- libs/metrics/src/more_process_metrics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs index 13a745e031..f91800685f 100644 --- a/libs/metrics/src/more_process_metrics.rs +++ b/libs/metrics/src/more_process_metrics.rs @@ -16,6 +16,7 @@ pub struct Collector { const NMETRICS: usize = 2; static CLK_TCK_F64: Lazy = Lazy::new(|| { + // SAFETY: libc::sysconf is safe, it merely returns a value. 
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; if long == -1 { panic!("sysconf(_SC_CLK_TCK) failed"); From c6ff18affccc73372078c393163f06a88b871820 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 6 May 2025 10:51:51 +0400 Subject: [PATCH 041/142] cosmetics(pgxn/neon): WP code small clean up (#11824) ## Problem Some small cosmetic changes I made while reading the code. Should not affect anything. ## Summary of changes - Remove `n_votes` field because it's not used anymore - Explicitly initialize `safekeepers_generation` with `INVALID_GENERATION` if the generation is not present (the struct is zero-initialized anyway, but the explicit initialization is better IMHO) - Access SafekeeperId via pointer `sk_id` created above --- pgxn/neon/neon_walreader.c | 2 +- pgxn/neon/walproposer.c | 6 +++--- pgxn/neon/walproposer.h | 3 --- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index be2c4ddf79..d5e3a38dbb 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -150,7 +150,7 @@ NeonWALReaderFree(NeonWALReader *state) * fetched from timeline 'tli'. * * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error - * occurs, in which case 'err' has the desciption. Error always closes remote + * occurs, in which case 'err' has the description. Error always closes remote * connection, if there was any, so socket subscription should be removed. * * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index f4f1398375..3befb42030 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -124,6 +124,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } else { + wp->safekeepers_generation = INVALID_GENERATION; host = wp->config->safekeepers_list; } wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); @@ -756,7 +757,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.members.m[i]; - if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { /* * If mconf or list of safekeepers to connect to changed (the @@ -781,7 +782,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; - if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) { @@ -1071,7 +1072,6 @@ RecvVoteResponse(Safekeeper *sk) /* ready for elected message */ sk->state = SS_WAIT_ELECTED; - wp->n_votes++; /* Are we already elected? 
*/ if (wp->state == WPS_CAMPAIGN) { diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 648b0015ad..83ef72d3d7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -845,9 +845,6 @@ typedef struct WalProposer /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; - /* number of votes collected from safekeepers */ - int n_votes; - /* number of successful connections over the lifetime of walproposer */ int n_connected; From f0e7b3e0efc964539c9561b5976922e9c1cccbcb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 6 May 2025 10:24:27 +0300 Subject: [PATCH 042/142] Use unlogged build for gist_indexsortbuild_flush_ready_pages (#11753) ## Problem See https://github.com/neondatabase/neon/issues/11718 GIST index can be constructed in two ways: GIST_SORTED_BUILD and GIST_BUFFERING. We used unlogged build in the second case but not in the first. ## Summary of changes Use unlogged build in `gist_indexsortbuild_flush_ready_pages` Correspondent Postgres PRsL: https://github.com/neondatabase/postgres/pull/624 https://github.com/neondatabase/postgres/pull/625 https://github.com/neondatabase/postgres/pull/626 --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pgxn/neon/pagestore_smgr.c | 6 ++++++ test_runner/regress/test_gist.py | 28 ++++++++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 6 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_gist.py diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3bf0bedf99..87eb420717 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1989,8 +1989,14 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } +#if PG_MAJORVERSION_NUM >= 17 + /* + * We have to disable this check for pg14-16 because sorted build of GIST index requires + * to perform unlogged build several times + */ if (smgrnblocks(reln, MAIN_FORKNUM) != 0) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); +#endif unlogged_build_rel = reln; unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; diff --git a/test_runner/regress/test_gist.py b/test_runner/regress/test_gist.py new file mode 100644 index 0000000000..89e3b9b2b1 --- /dev/null +++ b/test_runner/regress/test_gist.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +# +# Test unlogged build for GIST index +# +def test_gist(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + iterations = 100 + + for _ in range(iterations): + cur.execute( + "CREATE TABLE pvactst (i INT, a INT[], p POINT) with (autovacuum_enabled = off)" + ) + cur.execute( + "INSERT INTO pvactst SELECT i, array[1,2,3], point(i, i+1) FROM generate_series(1,1000) i" + ) + cur.execute("CREATE INDEX gist_pvactst ON pvactst USING gist (p)") + cur.execute("VACUUM pvactst") + cur.execute("DROP TABLE pvactst") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d3c9d61fb7..c8dab02bfc 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d3c9d61fb7a362a165dac7060819dd9d6ad68c28 +Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 
index 8ecb12f21d..b838c8969b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3 +Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 37496f87b5..05ddf212e2 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 37496f87b5324af53c56127e278ee5b1e8435253 +Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae diff --git a/vendor/revisions.json b/vendor/revisions.json index 90d878d0f7..74a6ff33d7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,14 +5,14 @@ ], "v16": [ "16.8", - "37496f87b5324af53c56127e278ee5b1e8435253" + "05ddf212e2e07b788b5c8b88bdcf98630941f6ae" ], "v15": [ "15.12", - "8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3" + "b838c8969b7c63f3e637a769656f5f36793b797c" ], "v14": [ "14.17", - "d3c9d61fb7a362a165dac7060819dd9d6ad68c28" + "c8dab02bfc003ae7bd59096919042d7840f3c194" ] } From 62ac5b94b3842322f267df28ff90ad3c9d4060a7 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 6 May 2025 09:28:25 +0000 Subject: [PATCH 043/142] proxy: Include the exp/nbf timestamps in the errors (#11828) ## Problem It's difficult to tell when the JWT expired from current logs and error messages. ## Summary of changes Add exp/nbf timestamps to the respective error variants. Also use checked_add when deserializing a SystemTime from JWT. Related to INC-509 --- proxy/src/auth/backend/jwt.rs | 41 ++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 44a6a42665..a48f67199a 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -409,14 +409,22 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired( + exp.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ))); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { return Err(JwtError::InvalidClaims( - JwtClaimsError::JwtTokenNotYetReadyToUse, + JwtClaimsError::JwtTokenNotYetReadyToUse( + nbf.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), )); } } @@ -534,10 +542,10 @@ struct JwtPayload<'a> { #[serde(rename = "aud", default)] audience: OneOrMany, /// Expiration - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] + #[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)] expiration: Option, - /// Not before - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)] + /// Not before - Time before which the JWT is not valid + #[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)] not_before: Option, // the following entries are only extracted for the sake of debug logging. @@ -609,8 +617,15 @@ impl<'de> Deserialize<'de> for OneOrMany { } fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { - let d = >::deserialize(d)?; - Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) + >::deserialize(d)? 
+ .map(|t| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_secs(t)) + .ok_or_else(|| { + serde::de::Error::custom(format_args!("timestamp out of bounds: {t}")) + }) + }) + .transpose() } struct JwkRenewalPermit<'a> { @@ -746,11 +761,11 @@ pub enum JwtClaimsError { #[error("invalid JWT token audience")] InvalidJwtTokenAudience, - #[error("JWT token has expired")] - JwtTokenHasExpired, + #[error("JWT token has expired (exp={0})")] + JwtTokenHasExpired(u64), - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error("JWT token is not yet ready to use (nbf={0})")] + JwtTokenNotYetReadyToUse(u64), } #[allow(dead_code, reason = "Debug use only")] @@ -1233,14 +1248,14 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL "nbf": now + 60, "aud": "neon", }}, - error: JwtClaimsError::JwtTokenNotYetReadyToUse, + error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60), }, Test { body: json! {{ "exp": now - 60, "aud": ["neon"], }}, - error: JwtClaimsError::JwtTokenHasExpired, + error: JwtClaimsError::JwtTokenHasExpired(now - 60), }, Test { body: json! {{ From 50dc2fae771c6720b77eeb431e90a2cb300b9b5c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 6 May 2025 11:52:21 +0100 Subject: [PATCH 044/142] compute-node.Dockerfile: remove layer with duplicated name (#11807) ## Problem Two `rust-extensions-build-pgrx14` layers were added independently in two different PRs, and the layers are exactly the same ## Summary of changes - Remove one of `rust-extensions-build-pgrx14` layers --- compute/compute-node.Dockerfile | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index cc338cec6a..8766eb519e 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1084,23 +1084,12 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root + ######################################################################################### # # Layer "rust extensions pgrx14" # -######################################################################################### -FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 -ARG PG_VERSION - -RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ - /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' - -USER root -######################################################################################### -# -# Layer "rust extensions pgrx14" -# -# Version 14 is now required by a few +# Version 14 is now required by a few # This layer should be used as a base for new pgrx extensions, # and eventually get merged with `rust-extensions-build` # From c82e363ed90742214de1bf5efb7a721019e89d42 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 6 May 2025 14:26:13 +0200 Subject: [PATCH 045/142] cleanup orphan projects created by python tests, too (#11836) ## Problem - some projects are created during GitHub workflows but not by action project_create but by python test scripts. 
If the python test fails the project is not deleted ## Summary of changes - make sure we cleanup those python created projects a few days after they are no longer used, too --- .github/workflows/benchmarking.yml | 2 +- test_runner/random_ops/test_random_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 220d7905b1..79371ec704 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -64,7 +64,7 @@ jobs: env: ORG_ID: org-solitary-dew-09443886 LIMIT: 100 - SEARCH: "Created by actions/neon-project-create; GITHUB_RUN_ID" + SEARCH: "GITHUB_RUN_ID=" BASE_URL: https://console-stage.neon.build/api/v2 DRY_RUN: "false" # Set to "true" to just test out the workflow diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index 643151fa11..645c9b7b9d 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -206,7 +206,7 @@ class NeonProject: self.neon_api = neon_api self.pg_bin = pg_bin proj = self.neon_api.create_project( - pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}" + pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" ) self.id: str = proj["project"]["id"] self.name: str = proj["project"]["name"] From 6827f2f58ccb8dec0922d0a2c7a413998cf2f539 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 6 May 2025 20:27:16 +0800 Subject: [PATCH 046/142] fix(pageserver): only keep `iter_with_options` API, improve docs in gc-compact (#11804) ## Problem Address comments in https://github.com/neondatabase/neon/pull/11709 ## Summary of changes - remove `iter` API, users always need to specify buffer size depending on the expected memory usage. - several doc improvements --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- .../src/tenant/storage_layer/delta_layer.rs | 15 +----- .../tenant/storage_layer/filter_iterator.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 15 +----- .../tenant/storage_layer/merge_iterator.rs | 46 +++++++++++-------- pageserver/src/tenant/timeline/compaction.rs | 13 ++++-- 5 files changed, 42 insertions(+), 51 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 607b0d513c..11875ac653 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1441,14 +1441,6 @@ impl DeltaLayerInner { offset } - pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1634,7 +1626,6 @@ pub(crate) mod test { use crate::tenant::disk_btree::tests::TestDisk; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; /// Construct an index for a fictional delta layer and and then @@ -2311,8 +2302,7 @@ pub(crate) mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -2329,8 +2319,7 @@ pub(crate) mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_delta_iter_equal(&mut iter, &test_deltas).await; } } diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8d172a1c19..1a330ecfc2 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -157,7 +157,7 @@ mod tests { .await .unwrap(); - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, @@ -182,7 +182,7 @@ mod tests { result.extend(test_deltas1[90..100].iter().cloned()); assert_filter_iter_equal(&mut filter_iter, &result).await; - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2f7c5715bb..d684230572 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -684,14 +684,6 @@ impl ImageLayerInner { } } - pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub(crate) fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1240,7 +1232,6 @@ mod test { use crate::context::RequestContext; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; #[tokio::test] @@ -1507,8 +1498,7 @@ mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -1525,8 +1515,7 @@ mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; } } diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index e084e3d567..ea3dea50c3 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -19,14 +19,6 @@ pub(crate) enum LayerRef<'a> { } impl<'a> LayerRef<'a> { - #[allow(dead_code)] - fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { - match self { - Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), - Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), - } - } - fn iter_with_options( self, ctx: &'a RequestContext, @@ -322,6 +314,28 @@ impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { } impl<'a> MergeIterator<'a> { + #[cfg(test)] + pub(crate) fn create_for_testing( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) + } + + /// Create a new merge iterator with custom options. + /// + /// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale + /// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that + /// the buffer does not take too much memory. 
+ /// + /// The default options for L0 compactions are: + /// - max_read_size: 1024 * 8192 (8MB) + /// - max_batch_size: 1024 + /// + /// The default options for gc-compaction are: + /// - max_read_size: 128 * 8192 (1MB) + /// - max_batch_size: 128 pub fn create_with_options( deltas: &[&'a DeltaLayerInner], images: &[&'a ImageLayerInner], @@ -351,14 +365,6 @@ impl<'a> MergeIterator<'a> { } } - pub fn create( - deltas: &[&'a DeltaLayerInner], - images: &[&'a ImageLayerInner], - ctx: &'a RequestContext, - ) -> Self { - Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) - } - pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { @@ -477,7 +483,7 @@ mod tests { let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_2.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -549,7 +555,7 @@ mod tests { let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_2.get_as_delta(&ctx).await.unwrap(), @@ -670,7 +676,7 @@ mod tests { // Test with different layer order for MergeIterator::create to ensure the order // is stable. - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_4.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -682,7 +688,7 @@ mod tests { ); assert_merge_iter_equal(&mut merge_iter, &expect).await; - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_4.get_as_delta(&ctx).await.unwrap(), diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9086d29d50..d0c13d86ce 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1994,7 +1994,13 @@ impl Timeline { let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; deltas.push(l); } - MergeIterator::create(&deltas, &[], ctx) + MergeIterator::create_with_options( + &deltas, + &[], + ctx, + 1024 * 8192, /* 8 MiB buffer per layer iterator */ + 1024, + ) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -2828,7 +2834,7 @@ impl Timeline { Ok(()) } - /// Check if the memory usage is within the limit. + /// Check to bail out of gc compaction early if it would use too much memory. async fn check_memory_usage( self: &Arc, layer_selection: &[Layer], @@ -2841,7 +2847,8 @@ impl Timeline { let layer_desc = layer.layer_desc(); if layer_desc.is_delta() { // Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB). - // Multiply the layer size so that tests can pass. + // Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt + // use 3MB layer size and we need to account for that). 
estimated_memory_usage_mb += 3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64; num_delta_layers += 1; From 0e0ad073bf609fbc38e86f4030f1902c2632c5f7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 6 May 2025 15:57:34 +0200 Subject: [PATCH 047/142] storcon: fix split aborts removing other tenants (#11837) ## Problem When aborting a split, the code accidentally removes all other tenant shards from the in-memory map that have the same shard count as the aborted split, causing "tenant not found" errors. It will recover on a storcon restart, when it loads the persisted state. This issue has been present for at least a year. Resolves https://github.com/neondatabase/cloud/issues/28589. ## Summary of changes Only remove shards belonging to the relevant tenant when aborting a split. Also adds a regression test. --- storage_controller/src/service.rs | 3 ++- test_runner/regress/test_sharding.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 72379f0810..21c693af97 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5181,7 +5181,8 @@ impl Service { } // We don't expect any new_shard_count shards to exist here, but drop them just in case - tenants.retain(|_id, s| s.shard.count != *new_shard_count); + tenants + .retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count)); detach_locations }; diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0bfc4b1d8c..4c9887fb92 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1334,6 +1334,13 @@ def test_sharding_split_failures( tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' ) + # Create bystander tenants with various shard counts. They should not be affected by the aborted + # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589. + bystanders = {} # id → shard_count + for bystander_shard_count in [1, 2, 4, 8]: + id, _ = env.create_tenant(shard_count=bystander_shard_count) + bystanders[id] = bystander_shard_count + env.storage_controller.allowed_errors.extend( [ # All split failures log a warning when then enqueue the abort operation @@ -1394,6 +1401,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count if loc[1]["mode"] == "Secondary": @@ -1414,6 +1423,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count if loc[1]["mode"] == "Secondary": @@ -1496,6 +1507,12 @@ def test_sharding_split_failures( # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) + # Check that all bystanders are still around. 
+ for bystander_id, bystander_shard_count in bystanders.items(): + response = env.storage_controller.tenant_describe(bystander_id) + assert TenantId(response["tenant_id"]) == bystander_id + assert len(response["shards"]) == bystander_shard_count + env.storage_controller.consistency_check() From 79ee78ea32754ed2b1cfa360c55a6a2aa11ad871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Tue, 6 May 2025 17:18:50 +0200 Subject: [PATCH 048/142] feat(compute): enable audit logs for pg_session_jwt extension (#11829) related to https://github.com/neondatabase/cloud/issues/28480 related to https://github.com/neondatabase/pg_session_jwt/pull/36 cc @MihaiBojin @conradludgate @lneves12 --- compute/compute-node.Dockerfile | 4 ++-- compute_tools/src/config.rs | 3 +++ .../ext-src/pg_session_jwt-src/expected/basic_functions.out | 1 + proxy/src/serverless/local_conn_pool.rs | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 8766eb519e..8bdf5cb7d1 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1322,8 +1322,8 @@ ARG PG_VERSION # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs WORKDIR /ext-src -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \ - echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.1.tar.gz -O pg_session_jwt.tar.gz && \ + echo "62fec9e472cb805c53ba24a0765afdb8ea2720cfc03ae7813e61687b36d1b0ad pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 71c6123c3b..42d245f55a 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -223,6 +223,9 @@ pub fn write_postgres_conf( // TODO: tune this after performance testing writeln!(file, "pgaudit.log_rotation_age=5")?; + // Enable audit logs for pg_session_jwt extension + writeln!(file, "pg_session_jwt.audit_log=on")?; + // Add audit shared_preload_libraries, if they are not present. 
// // The caller who sets the flag is responsible for ensuring that the necessary diff --git a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out index ca54864ecd..ff6a7404cb 100644 --- a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out +++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out @@ -12,6 +12,7 @@ ERROR: invalid JWT encoding -- Test creating a session with an expired JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw'); ERROR: Token used after it has expired +DETAIL: exp=1742564432 -- Test creating a session with a valid JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg'); jwt_session_init diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1d9b35f41d..bb5637cd5f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.3.0"; +pub(crate) const EXT_VERSION: &str = "0.3.1"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] From f9b3a2e059c4c1fd764da011e7a5364d86e60491 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 May 2025 14:51:10 -0500 Subject: [PATCH 049/142] Add scoping to compute_ctl JWT claims (#11639) Currently we only have an admin scope which allows a user to bypass the compute_id check. When the admin scope is provided, validate the audience of the JWT to be "compute". 
Closes: https://github.com/neondatabase/cloud/issues/27614 Signed-off-by: Tristan Partin --- .../src/http/middleware/authorize.rs | 59 +++++++++++--- control_plane/src/bin/neon_local.rs | 20 +++-- control_plane/src/endpoint.rs | 20 +++-- libs/compute_api/src/requests.rs | 41 +++++++++- test_runner/fixtures/endpoint/http.py | 12 +++ test_runner/fixtures/neon_cli.py | 7 +- test_runner/fixtures/neon_fixtures.py | 12 ++- test_runner/regress/test_compute_http.py | 78 +++++++++++++++++++ 8 files changed, 222 insertions(+), 27 deletions(-) create mode 100644 test_runner/regress/test_compute_http.py diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2d0f411d7a..2afc57ad9c 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -1,12 +1,10 @@ -use std::collections::HashSet; - use anyhow::{Result, anyhow}; use axum::{RequestExt, body::Body}; use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; -use compute_api::requests::ComputeClaims; +use compute_api::requests::{COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope}; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; @@ -25,13 +23,14 @@ pub(in crate::http) struct Authorize { impl Authorize { pub fn new(compute_id: String, jwks: JwkSet) -> Self { let mut validation = Validation::new(Algorithm::EdDSA); - // Nothing is currently required - validation.required_spec_claims = HashSet::new(); validation.validate_exp = true; // Unused by the control plane - validation.validate_aud = false; - // Unused by the control plane validation.validate_nbf = false; + // Unused by the control plane + validation.validate_aud = false; + validation.set_audience(&[COMPUTE_AUDIENCE]); + // Nothing is currently required + validation.set_required_spec_claims(&[] as &[&str; 0]); Self { compute_id, @@ -64,11 +63,47 @@ impl AsyncAuthorizeRequest for Authorize { Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), }; - if data.claims.compute_id != compute_id { - return Err(JsonResponse::error( - StatusCode::UNAUTHORIZED, - "invalid compute ID in authorization token claims", - )); + match data.claims.scope { + // TODO: We should validate audience for every token, but + // instead of this ad-hoc validation, we should turn + // [`Validation::validate_aud`] on. This is merely a stopgap + // while we roll out `aud` deployment. We return a 401 + // Unauthorized because when we eventually do use + // [`Validation`], we will hit the above `Err` match arm which + // returns 401 Unauthorized. 
+ Some(ComputeClaimsScope::Admin) => { + let Some(ref audience) = data.claims.audience else { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "missing audience in authorization token claims", + )); + }; + + if audience != COMPUTE_AUDIENCE { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid audience in authorization token claims", + )); + } + } + + // If the scope is not [`ComputeClaimsScope::Admin`], then we + // must validate the compute_id + _ => { + let Some(ref claimed_compute_id) = data.claims.compute_id else { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "missing compute_id in authorization token claims", + )); + }; + + if *claimed_compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "invalid compute ID in authorization token claims", + )); + } + } } // Make claims available to any subsequent middleware or request diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f55c0310f..44698f7b23 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -16,6 +16,7 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; +use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; @@ -705,6 +706,9 @@ struct EndpointStopCmdArgs { struct EndpointGenerateJwtCmdArgs { #[clap(help = "Postgres endpoint id")] endpoint_id: String, + + #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)] + scope: Option, } #[derive(clap::Subcommand)] @@ -1540,12 +1544,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint.stop(&args.mode, args.destroy)?; } EndpointCmd::GenerateJwt(args) => { - let endpoint_id = &args.endpoint_id; - let endpoint = cplane - .endpoints - .get(endpoint_id) - .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let jwt = endpoint.generate_jwt()?; + let endpoint = { + let endpoint_id = &args.endpoint_id; + + cplane + .endpoints + .get(endpoint_id) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))? + }; + + let jwt = endpoint.generate_jwt(args.scope)?; print!("{jwt}"); } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 4071b620d6..0b16339a6f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -45,7 +45,9 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; -use compute_api::requests::{ComputeClaims, ConfigurationRequest}; +use compute_api::requests::{ + COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, +}; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig, }; @@ -630,9 +632,17 @@ impl Endpoint { } /// Generate a JWT with the correct claims. 
- pub fn generate_jwt(&self) -> Result { + pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { - compute_id: self.endpoint_id.clone(), + audience: match scope { + Some(ComputeClaimsScope::Admin) => Some(COMPUTE_AUDIENCE.to_owned()), + _ => Some(self.endpoint_id.clone()), + }, + compute_id: match scope { + Some(ComputeClaimsScope::Admin) => None, + _ => Some(self.endpoint_id.clone()), + }, + scope, }) } @@ -903,7 +913,7 @@ impl Endpoint { self.external_http_address.port() ), ) - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .send() .await?; @@ -980,7 +990,7 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .body( serde_json::to_string(&ConfigurationRequest { spec, diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 98f2fc297c..40d34eccea 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,16 +1,55 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. +use std::str::FromStr; + use serde::{Deserialize, Serialize}; use crate::privilege::Privilege; use crate::responses::ComputeCtlConfig; use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; +/// The value to place in the [`ComputeClaims::audience`] claim. +pub static COMPUTE_AUDIENCE: &str = "compute"; + +#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +/// Available scopes for a compute's JWT. +pub enum ComputeClaimsScope { + /// An admin-scoped token allows access to all of `compute_ctl`'s authorized + /// facilities. + Admin, +} + +impl FromStr for ComputeClaimsScope { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "admin" => Ok(ComputeClaimsScope::Admin), + _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), + } + } +} + /// When making requests to the `compute_ctl` external HTTP server, the client /// must specify a set of claims in `Authorization` header JWTs such that /// `compute_ctl` can authorize the request. #[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename = "snake_case")] pub struct ComputeClaims { - pub compute_id: String, + /// The compute ID that will validate the token. The only case in which this + /// can be [`None`] is if [`Self::scope`] is + /// [`ComputeClaimsScope::Admin`]. + pub compute_id: Option, + + /// The scope of what the token authorizes. + pub scope: Option, + + /// The recipient the token is intended for. + /// + /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for + /// more information. + #[serde(rename = "aud")] + pub audience: Option, } /// Request of the /configure API diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 652c38f5c3..beed1dcd93 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,6 +1,7 @@ from __future__ import annotations import urllib.parse +from enum import StrEnum from typing import TYPE_CHECKING, final import requests @@ -14,6 +15,17 @@ if TYPE_CHECKING: from requests import PreparedRequest +COMPUTE_AUDIENCE = "compute" +""" +The value to place in the `aud` claim. 
+""" + + +@final +class ComputeClaimsScope(StrEnum): + ADMIN = "admin" + + @final class BearerAuth(AuthBase): """ diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index b5d69b5ab6..3be78719d7 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Any, ) + from fixtures.endpoint.http import ComputeClaimsScope from fixtures.pg_version import PgVersion @@ -535,12 +536,16 @@ class NeonLocalCli(AbstractNeonCli): res.check_returncode() return res - def endpoint_generate_jwt(self, endpoint_id: str) -> str: + def endpoint_generate_jwt( + self, endpoint_id: str, scope: ComputeClaimsScope | None = None + ) -> str: """ Generate a JWT for making requests to the endpoint's external HTTP server. """ args = ["endpoint", "generate-jwt", endpoint_id] + if scope: + args += ["--scope", str(scope)] cmd = self.raw_cli(args) cmd.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 47d1228c61..133be5c045 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS -from fixtures.endpoint.http import EndpointHttpClient +from fixtures.endpoint.http import ComputeClaimsScope, EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -4218,7 +4218,7 @@ class Endpoint(PgProtocol, LogUtils): self.config(config_lines) - self.__jwt = self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id) + self.__jwt = self.generate_jwt() return self @@ -4265,6 +4265,14 @@ class Endpoint(PgProtocol, LogUtils): return self + def generate_jwt(self, scope: ComputeClaimsScope | None = None) -> str: + """ + Generate a JWT for making requests to the endpoint's external HTTP + server. + """ + assert self.endpoint_id is not None + return self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id, scope) + def endpoint_path(self) -> Path: """Path to endpoint directory""" assert self.endpoint_id diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py new file mode 100644 index 0000000000..ce31ff0fe6 --- /dev/null +++ b/test_runner/regress/test_compute_http.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from http.client import FORBIDDEN, UNAUTHORIZED +from typing import TYPE_CHECKING + +import jwt +import pytest +from fixtures.endpoint.http import COMPUTE_AUDIENCE, ComputeClaimsScope, EndpointHttpClient +from fixtures.utils import run_only_on_default_postgres +from requests import RequestException + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_no_scope_claim(neon_simple_env: NeonEnv): + """ + Test that if the JWT scope is not admin and no compute_id is specified, + the external HTTP server returns a 403 Forbidden error. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + # Encode nothing in the token + token = jwt.encode({}, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == FORBIDDEN + + +@pytest.mark.parametrize( + "audience", + (COMPUTE_AUDIENCE, "invalid", None), + ids=["with_audience", "with_invalid_audience", "without_audience"], +) +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | None): + """ + Test that an admin-scoped JWT can access the compute's external HTTP server + without the compute_id being specified in the claims. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + data = {"scope": str(ComputeClaimsScope.ADMIN)} + if audience: + data["aud"] = audience + + token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + if audience != COMPUTE_AUDIENCE: + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == UNAUTHORIZED From 384e3df2ad2c1561cc9d427793b2d54eaa3f137b Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 6 May 2025 17:52:15 -0400 Subject: [PATCH 050/142] fix: pinned anon extension to v2.1.0 (#11844) ## Problem Currently the setup for `anon` v2 in the compute image downloads the latest version of the extension. This can be problematic as on a compute start/restart it can download a version that is newer than what we have tested and potentially break things, hence not giving us the ability to control when the extension is updated. We were also using `v2.2.0`, which is not ready for production yet and has been clarified by the maintainer. Additional context: https://gitlab.com/dalibo/postgresql_anonymizer/-/issues/530 ## Summary of changes Changed the URL from which we download the `anon` extension to point to `v2.1.0` instead of `latest`. --- compute/compute-node.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 8bdf5cb7d1..6233eaf709 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1351,7 +1351,8 @@ COPY compute/patches/anon_v2.patch . # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/2.1.0/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + echo "48e7f5ae2f1ca516df3da86c5c739d48dd780a4e885705704ccaad0faa89d6c0 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ From 5c356c63ebefa448ea521565df35d59f1c4a933b Mon Sep 17 00:00:00 2001 From: Mikhail Date: Tue, 6 May 2025 23:02:12 +0100 Subject: [PATCH 051/142] endpoint_storage compute_ctl integration (#11550) Add `/lfc/(prewarm|offload)` routes to `compute_ctl` which interact with endpoint storage. Add `prewarm_lfc_on_startup` spec option which, if enabled, downloads LFC prewarm data on compute startup. Resolves: https://github.com/neondatabase/cloud/issues/26343 --- Cargo.lock | 2 + Cargo.toml | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 49 ++++- compute_tools/src/compute_prewarm.rs | 202 +++++++++++++++++++ compute_tools/src/http/routes/lfc.rs | 39 ++++ compute_tools/src/http/routes/mod.rs | 1 + compute_tools/src/http/server.rs | 4 +- compute_tools/src/lib.rs | 1 + compute_tools/src/metrics.rs | 22 +- compute_tools/tests/pg_helpers_tests.rs | 1 + control_plane/Cargo.toml | 2 +- control_plane/src/bin/neon_local.rs | 19 +- control_plane/src/endpoint.rs | 5 + control_plane/src/endpoint_storage.rs | 10 +- control_plane/src/local_env.rs | 16 +- endpoint_storage/src/app.rs | 12 +- endpoint_storage/src/claims.rs | 52 +++++ endpoint_storage/src/lib.rs | 85 +++----- libs/compute_api/src/responses.rs | 24 +++ libs/compute_api/src/spec.rs | 9 + libs/compute_api/tests/cluster_spec.json | 5 + libs/utils/src/id.rs | 3 + test_runner/fixtures/endpoint/http.py | 30 +++ test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_endpoint_storage.py | 3 +- test_runner/regress/test_lfc_prewarm.py | 166 ++++++++++----- 27 files changed, 630 insertions(+), 138 deletions(-) create mode 100644 compute_tools/src/compute_prewarm.rs create mode 100644 compute_tools/src/http/routes/lfc.rs create mode 100644 endpoint_storage/src/claims.rs diff --git a/Cargo.lock b/Cargo.lock index 4c464c62b8..fe4cc35029 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1284,6 +1284,7 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "aws-config", "aws-sdk-kms", "aws-sdk-s3", @@ -1420,6 +1421,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "endpoint_storage", "futures", "http-utils", "humantime", diff --git a/Cargo.toml b/Cargo.toml index 1c203af9e0..8d4cc4a75a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -243,6 +243,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 
8c1e7ad149..8ee5dd0665 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,7 @@ default = [] testing = ["fail/failpoints"] [dependencies] +async-compression.workspace = true base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8834f0d63d..08d915b331 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,17 +1,10 @@ -use std::collections::HashMap; -use std::os::unix::fs::{PermissionsExt, symlink}; -use std::path::Path; -use std::process::{Command, Stdio}; -use std::str::FromStr; -use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::{Duration, Instant}; -use std::{env, fs}; - use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, + LfcPrewarmState, +}; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, }; @@ -25,6 +18,16 @@ use postgres; use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; +use std::collections::HashMap; +use std::net::SocketAddr; +use std::os::unix::fs::{PermissionsExt, symlink}; +use std::path::Path; +use std::process::{Command, Stdio}; +use std::str::FromStr; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Condvar, Mutex, RwLock}; +use std::time::{Duration, Instant}; +use std::{env, fs}; use tokio::spawn; use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -150,6 +153,9 @@ pub struct ComputeState { /// set up the span relationship ourselves. pub startup_span: Option, + pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_offload_state: LfcOffloadState, + pub metrics: ComputeMetrics, } @@ -163,6 +169,8 @@ impl ComputeState { pspec: None, startup_span: None, metrics: ComputeMetrics::default(), + lfc_prewarm_state: LfcPrewarmState::default(), + lfc_offload_state: LfcOffloadState::default(), } } @@ -198,6 +206,8 @@ pub struct ParsedSpec { pub pageserver_connstr: String, pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, + pub endpoint_storage_addr: Option, + pub endpoint_storage_token: Option, } impl TryFrom for ParsedSpec { @@ -251,6 +261,18 @@ impl TryFrom for ParsedSpec { .or(Err("invalid timeline id"))? 
}; + let endpoint_storage_addr: Option = spec + .endpoint_storage_addr + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr")) + .unwrap_or_default() + .parse() + .ok(); + let endpoint_storage_token = spec + .endpoint_storage_token + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token")); + Ok(ParsedSpec { spec, pageserver_connstr, @@ -258,6 +280,8 @@ impl TryFrom for ParsedSpec { storage_auth_token, tenant_id, timeline_id, + endpoint_storage_addr, + endpoint_storage_token, }) } } @@ -736,6 +760,9 @@ impl ComputeNode { // Log metrics so that we can search for slow operations in logs info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + if pspec.spec.prewarm_lfc_on_startup { + self.prewarm_lfc(); + } Ok(()) } diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs new file mode 100644 index 0000000000..a6a84b3f1f --- /dev/null +++ b/compute_tools/src/compute_prewarm.rs @@ -0,0 +1,202 @@ +use crate::compute::ComputeNode; +use anyhow::{Context, Result, bail}; +use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; +use compute_api::responses::LfcOffloadState; +use compute_api::responses::LfcPrewarmState; +use http::StatusCode; +use reqwest::Client; +use std::sync::Arc; +use tokio::{io::AsyncReadExt, spawn}; +use tracing::{error, info}; + +#[derive(serde::Serialize, Default)] +pub struct LfcPrewarmStateWithProgress { + #[serde(flatten)] + base: LfcPrewarmState, + total: i32, + prewarmed: i32, + skipped: i32, +} + +/// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks +struct EndpointStoragePair { + url: String, + token: String, +} + +const KEY: &str = "lfc_state"; +impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair { + type Error = anyhow::Error; + fn try_from(pspec: &crate::compute::ParsedSpec) -> Result { + let Some(ref endpoint_id) = pspec.spec.endpoint_id else { + bail!("pspec.endpoint_id missing") + }; + let Some(ref base_uri) = pspec.endpoint_storage_addr else { + bail!("pspec.endpoint_storage_addr missing") + }; + let tenant_id = pspec.tenant_id; + let timeline_id = pspec.timeline_id; + + let url = format!("http://{base_uri}/{tenant_id}/{timeline_id}/{endpoint_id}/{KEY}"); + let Some(ref token) = pspec.endpoint_storage_token else { + bail!("pspec.endpoint_storage_token missing") + }; + let token = token.clone(); + Ok(EndpointStoragePair { url, token }) + } +} + +impl ComputeNode { + // If prewarm failed, we want to get overall number of segments as well as done ones. + // However, this function should be reliable even if querying postgres failed. 
+ pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress { + info!("requesting LFC prewarm state from postgres"); + let mut state = LfcPrewarmStateWithProgress::default(); + { + state.base = self.state.lock().unwrap().lfc_prewarm_state.clone(); + } + + let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await { + Ok(client) => client, + Err(err) => { + error!(%err, "connecting to postgres"); + return state; + } + }; + let row = match client + .query_one("select * from get_prewarm_info()", &[]) + .await + { + Ok(row) => row, + Err(err) => { + error!(%err, "querying LFC prewarm status"); + return state; + } + }; + state.total = row.try_get(0).unwrap_or_default(); + state.prewarmed = row.try_get(1).unwrap_or_default(); + state.skipped = row.try_get(2).unwrap_or_default(); + state + } + + pub fn lfc_offload_state(&self) -> LfcOffloadState { + self.state.lock().unwrap().lfc_offload_state.clone() + } + + /// Returns false if there is a prewarm request ongoing, true otherwise + pub fn prewarm_lfc(self: &Arc) -> bool { + crate::metrics::LFC_PREWARM_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_prewarm_state; + if let LfcPrewarmState::Prewarming = + std::mem::replace(state, LfcPrewarmState::Prewarming) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.prewarm_impl().await else { + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { + error: err.to_string(), + }; + }); + true + } + + fn endpoint_storage_pair(&self) -> Result { + let state = self.state.lock().unwrap(); + state.pspec.as_ref().unwrap().try_into() + } + + async fn prewarm_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from endpoint storage"); + + let request = Client::new().get(&url).bearer_auth(token); + let res = request.send().await.context("querying endpoint storage")?; + let status = res.status(); + if status != StatusCode::OK { + bail!("{status} querying endpoint storage") + } + + let mut uncompressed = Vec::new(); + let lfc_state = res + .bytes() + .await + .context("getting request body from endpoint storage")?; + ZstdDecoder::new(lfc_state.iter().as_slice()) + .read_to_end(&mut uncompressed) + .await + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres"); + + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? 
+ .query_one("select prewarm_local_cache($1)", &[&uncompressed]) + .await + .context("loading LFC state into postgres") + .map(|_| ()) + } + + /// Returns false if there is an offload request ongoing, true otherwise + pub fn offload_lfc(self: &Arc) -> bool { + crate::metrics::LFC_OFFLOAD_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_offload_state; + if let LfcOffloadState::Offloading = + std::mem::replace(state, LfcOffloadState::Offloading) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.offload_lfc_impl().await else { + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { + error: err.to_string(), + }; + }); + true + } + + async fn offload_lfc_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from postgres"); + + let mut compressed = Vec::new(); + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? + .query_one("select get_local_cache_state()", &[]) + .await + .context("querying LFC state")? + .try_get::(0) + .context("deserializing LFC state") + .map(ZstdEncoder::new)? + .read_to_end(&mut compressed) + .await + .context("compressing LFC state")?; + let compressed_len = compressed.len(); + info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); + + let request = Client::new().put(url).bearer_auth(token).body(compressed); + match request.send().await { + Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), + Err(err) => Err(err).context("writing to endpoint storage"), + } + } +} diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs new file mode 100644 index 0000000000..07bcc6bfb7 --- /dev/null +++ b/compute_tools/src/http/routes/lfc.rs @@ -0,0 +1,39 @@ +use crate::compute_prewarm::LfcPrewarmStateWithProgress; +use crate::http::JsonResponse; +use axum::response::{IntoResponse, Response}; +use axum::{Json, http::StatusCode}; +use compute_api::responses::LfcOffloadState; +type Compute = axum::extract::State>; + +pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { + Json(compute.lfc_prewarm_state().await) +} + +// Following functions are marked async for axum, as it's more convenient than wrapping these +// in async lambdas at call site + +pub(in crate::http) async fn offload_state(compute: Compute) -> Json { + Json(compute.lfc_offload_state()) +} + +pub(in crate::http) async fn prewarm(compute: Compute) -> Response { + if compute.prewarm_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm are not allowed", + ) + } +} + +pub(in crate::http) async fn offload(compute: Compute) -> Response { + if compute.offload_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm offload are not allowed", + ) + } +} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index a67be7fd5a..432e66a830 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -11,6 +11,7 @@ pub(in crate::http) mod extensions; pub(in crate::http) mod 
failpoints; pub(in crate::http) mod grants; pub(in crate::http) mod insights; +pub(in crate::http) mod lfc; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 10f767e97c..d5d2427971 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -23,7 +23,7 @@ use super::{ middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, + grants, insights, lfc, metrics, metrics_json, status, terminate, }, }; use crate::compute::ComputeNode; @@ -85,6 +85,8 @@ impl From<&Server> for Router> { Router::>::new().route("/metrics", get(metrics::get_metrics)); let authenticated_router = Router::>::new() + .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a681fad0b0..7218067a8a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod http; pub mod logger; pub mod catalog; pub mod compute; +pub mod compute_prewarm; pub mod disk_quota; pub mod extension_server; pub mod installed_extensions; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index e37d6120ac..90326b2074 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,7 +1,7 @@ use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ - IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, + IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; @@ -97,6 +97,24 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy> = Lazy:: .expect("failed to define a metric") }); +/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm. 
+/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm +pub(crate) static LFC_PREWARM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_prewarm_requests_total", + "Total number of LFC prewarm requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + +pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_offload_requests_total", + "Total number of LFC offload requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = COMPUTE_CTL_UP.collect(); metrics.extend(INSTALLED_EXTENSIONS.collect()); @@ -106,5 +124,7 @@ pub fn collect() -> Vec { metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); + metrics.extend(LFC_PREWARM_REQUESTS.collect()); + metrics.extend(LFC_OFFLOAD_REQUESTS.collect()); metrics } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index b72c1293ee..53f2ddad84 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,6 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on +prewarm_lfc_on_startup = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 92f0071bac..62c039047f 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -41,7 +41,7 @@ storage_broker.workspace = true http-utils.workspace = true utils.workspace = true whoami.workspace = true - +endpoint_storage.workspace = true compute_api.workspace = true workspace_hack.workspace = true tracing.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 44698f7b23..fd625e9ed6 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -20,7 +20,7 @@ use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage}; +use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, @@ -1022,7 +1022,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { }) .collect(), endpoint_storage: EndpointStorageConf { - port: ENDPOINT_STORAGE_DEFAULT_PORT, + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, }, pg_distrib_dir: None, neon_distrib_dir: None, @@ -1488,10 +1488,25 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res None }; + let exp = (std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)? 
+ + Duration::from_secs(86400)) + .as_secs(); + let claims = endpoint_storage::claims::EndpointStorageClaims { + tenant_id: endpoint.tenant_id, + timeline_id: endpoint.timeline_id, + endpoint_id: endpoint_id.to_string(), + exp, + }; + + let endpoint_storage_token = env.generate_auth_token(&claims)?; + let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string(); + println!("Starting existing endpoint {endpoint_id}..."); endpoint .start( &auth_token, + endpoint_storage_token, + endpoint_storage_addr, safekeepers_generation, safekeepers, pageservers, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 0b16339a6f..fe6a93eb5e 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -650,6 +650,8 @@ impl Endpoint { pub async fn start( &self, auth_token: &Option, + endpoint_storage_token: String, + endpoint_storage_addr: String, safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, @@ -743,6 +745,9 @@ impl Endpoint { drop_subscriptions_before_start: self.drop_subscriptions_before_start, audit_log_level: ComputeAudit::Disabled, logs_export_host: None::, + endpoint_storage_addr: Some(endpoint_storage_addr), + endpoint_storage_token: Some(endpoint_storage_token), + prewarm_lfc_on_startup: false, }; // this strange code is needed to support respec() in tests diff --git a/control_plane/src/endpoint_storage.rs b/control_plane/src/endpoint_storage.rs index 102db91a22..171aaeddb4 100644 --- a/control_plane/src/endpoint_storage.rs +++ b/control_plane/src/endpoint_storage.rs @@ -3,17 +3,19 @@ use crate::local_env::LocalEnv; use anyhow::{Context, Result}; use camino::Utf8PathBuf; use std::io::Write; +use std::net::SocketAddr; use std::time::Duration; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage"; -pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993; +pub const ENDPOINT_STORAGE_DEFAULT_ADDR: SocketAddr = + SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), 9993); pub struct EndpointStorage { pub bin: Utf8PathBuf, pub data_dir: Utf8PathBuf, pub pemfile: Utf8PathBuf, - pub port: u16, + pub addr: SocketAddr, } impl EndpointStorage { @@ -22,7 +24,7 @@ impl EndpointStorage { bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(), data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(), pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), - port: env.endpoint_storage.port, + addr: env.endpoint_storage.listen_addr, } } @@ -31,7 +33,7 @@ impl EndpointStorage { } fn listen_addr(&self) -> Utf8PathBuf { - format!("127.0.0.1:{}", self.port).into() + format!("{}:{}", self.addr.ip(), self.addr.port()).into() } pub fn init(&self) -> Result<()> { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a18b34daa4..4a8892c6de 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,7 +20,9 @@ use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::broker::StorageBroker; -use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage}; +use crate::endpoint_storage::{ + ENDPOINT_STORAGE_DEFAULT_ADDR, ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage, +}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; @@ -151,10 +153,10 @@ pub struct NeonLocalInitConf { pub generate_local_ssl_certs: bool, } -#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct EndpointStorageConf { - pub port: u16, + pub listen_addr: SocketAddr, } /// Broker config for cluster internal communication. 
@@ -241,6 +243,14 @@ impl Default for NeonStorageControllerConf { } } +impl Default for EndpointStorageConf { + fn default() -> Self { + Self { + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, + } + } +} + impl NeonBroker { pub fn client_url(&self) -> Url { let url = if let Some(addr) = self.listen_https_addr { diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index f07ef06328..0bd7fe5f28 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -343,7 +343,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; fn token() -> String { - let claims = endpoint_storage::Claims { + let claims = endpoint_storage::claims::EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -489,16 +489,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH } fn delete_prefix_token(uri: &str) -> String { - use serde::Serialize; let parts = uri.split("/").collect::>(); - #[derive(Serialize)] - struct PrefixClaims { - tenant_id: TenantId, - timeline_id: Option, - endpoint_id: Option, - exp: u64, - } - let claims = PrefixClaims { + let claims = endpoint_storage::claims::DeletePrefixClaims { tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), timeline_id: parts.get(2).map(|c| c.parse().unwrap()), endpoint_id: parts.get(3).map(ToString::to_string), diff --git a/endpoint_storage/src/claims.rs b/endpoint_storage/src/claims.rs new file mode 100644 index 0000000000..ef0f0eb0b4 --- /dev/null +++ b/endpoint_storage/src/claims.rs @@ -0,0 +1,52 @@ +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use utils::id::{EndpointId, TenantId, TimelineId}; + +/// Claims to add, remove, or retrieve endpoint data. Used by compute_ctl +#[derive(Deserialize, Serialize, PartialEq)] +pub struct EndpointStorageClaims { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub endpoint_id: EndpointId, + pub exp: u64, +} + +/// Claims to remove tenant, timeline, or endpoint data. 
Used by control plane +#[derive(Deserialize, Serialize, PartialEq)] +pub struct DeletePrefixClaims { + pub tenant_id: TenantId, + /// None when tenant is deleted (endpoint_id is also None in this case) + pub timeline_id: Option, + /// None when timeline is deleted + pub endpoint_id: Option, + pub exp: u64, +} + +impl Display for EndpointStorageClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "EndpointClaims(tenant_id={} timeline_id={} endpoint_id={} exp={})", + self.tenant_id, self.timeline_id, self.endpoint_id, self.exp + ) + } +} + +impl Display for DeletePrefixClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletePrefixClaims(tenant_id={} timeline_id={} endpoint_id={}, exp={})", + self.tenant_id, + self.timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.exp + ) + } +} diff --git a/endpoint_storage/src/lib.rs b/endpoint_storage/src/lib.rs index eb6b80c487..d1625dc843 100644 --- a/endpoint_storage/src/lib.rs +++ b/endpoint_storage/src/lib.rs @@ -1,3 +1,5 @@ +pub mod claims; +use crate::claims::{DeletePrefixClaims, EndpointStorageClaims}; use anyhow::Result; use axum::extract::{FromRequestParts, Path}; use axum::response::{IntoResponse, Response}; @@ -13,7 +15,7 @@ use std::result::Result as StdResult; use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; -use utils::id::{TenantId, TimelineId}; +use utils::id::{EndpointId, TenantId, TimelineId}; // simplified version of utils::auth::JwtAuth pub struct JwtAuth { @@ -79,26 +81,6 @@ pub struct Storage { pub max_upload_file_limit: usize, } -pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc - -#[derive(Deserialize, Serialize, PartialEq)] -pub struct Claims { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub endpoint_id: EndpointId, - pub exp: u64, -} - -impl Display for Claims { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})", - self.tenant_id, self.timeline_id, self.endpoint_id, self.exp - ) - } -} - #[derive(Deserialize, Serialize)] struct KeyRequest { tenant_id: TenantId, @@ -107,6 +89,13 @@ struct KeyRequest { path: String, } +#[derive(Deserialize, Serialize, PartialEq)] +struct PrefixKeyRequest { + tenant_id: TenantId, + timeline_id: Option, + endpoint_id: Option, +} + #[derive(Debug, PartialEq)] pub struct S3Path { pub path: RemotePath, @@ -165,7 +154,7 @@ impl FromRequestParts> for S3Path { .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: Claims = state + let claims: EndpointStorageClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "decoding token"))?; @@ -178,7 +167,7 @@ impl FromRequestParts> for S3Path { path.endpoint_id.clone() }; - let route = Claims { + let route = EndpointStorageClaims { tenant_id: path.tenant_id, timeline_id: path.timeline_id, endpoint_id, @@ -193,38 +182,13 @@ impl FromRequestParts> for S3Path { } } -#[derive(Deserialize, Serialize, PartialEq)] -pub struct PrefixKeyPath { - pub tenant_id: TenantId, - pub timeline_id: Option, - pub endpoint_id: Option, -} - -impl Display for PrefixKeyPath { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})", - 
self.tenant_id, - self.timeline_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()), - self.endpoint_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()) - ) - } -} - #[derive(Debug, PartialEq)] pub struct PrefixS3Path { pub path: RemotePath, } -impl From<&PrefixKeyPath> for PrefixS3Path { - fn from(path: &PrefixKeyPath) -> Self { +impl From<&DeletePrefixClaims> for PrefixS3Path { + fn from(path: &DeletePrefixClaims) -> Self { let timeline_id = path .timeline_id .as_ref() @@ -250,21 +214,27 @@ impl FromRequestParts> for PrefixS3Path { state: &Arc, ) -> Result { let Path(path) = parts - .extract::>() + .extract::>() .await .map_err(|e| bad_request(e, "invalid route"))?; let TypedHeader(Authorization(bearer)) = parts .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: PrefixKeyPath = state + let claims: DeletePrefixClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "invalid token"))?; - if path != claims { - return Err(unauthorized(path, claims)); + let route = DeletePrefixClaims { + tenant_id: path.tenant_id, + timeline_id: path.timeline_id, + endpoint_id: path.endpoint_id, + exp: claims.exp, + }; + if route != claims { + return Err(unauthorized(route, claims)); } - Ok((&path).into()) + Ok((&route).into()) } } @@ -297,7 +267,7 @@ mod tests { #[test] fn s3_path() { - let auth = Claims { + let auth = EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -327,10 +297,11 @@ mod tests { #[test] fn prefix_s3_path() { - let mut path = PrefixKeyPath { + let mut path = DeletePrefixClaims { tenant_id: TENANT_ID, timeline_id: None, endpoint_id: None, + exp: 0, }; let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); assert_eq!( diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index b7d6b7ca34..24d371c6eb 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,6 +46,30 @@ pub struct ExtensionInstallResponse { pub version: ExtVersion, } +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcPrewarmState { + #[default] + NotPrewarmed, + Prewarming, + Completed, + Failed { + error: String, + }, +} + +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcOffloadState { + #[default] + NotOffloaded, + Offloading, + Completed, + Failed { + error: String, + }, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index ad246c48ec..09b550b96c 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -172,6 +172,15 @@ pub struct ComputeSpec { /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding. /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514 pub logs_export_host: Option, + + /// Address of endpoint storage service + pub endpoint_storage_addr: Option, + /// JWT for authorizing requests to endpoint storage service + pub endpoint_storage_token: Option, + + /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + #[serde(default)] + pub prewarm_lfc_on_startup: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 37de24be5b..30e788a601 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -84,6 +84,11 @@ "value": "on", "vartype": "bool" }, + { + "name": "prewarm_lfc_on_startup", + "value": "off", + "vartype": "bool" + }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 6016c23a01..68cb1f0209 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -295,6 +295,9 @@ pub struct TenantId(Id); id_newtype!(TenantId); +/// If needed, reuse small string from proxy/src/types.rc +pub type EndpointId = String; + // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index beed1dcd93..4b4b98aa6c 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -10,6 +10,7 @@ from requests.auth import AuthBase from typing_extensions import override from fixtures.log_helper import log +from fixtures.utils import wait_until if TYPE_CHECKING: from requests import PreparedRequest @@ -62,6 +63,35 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def prewarm_lfc_status(self) -> dict[str, str]: + res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm") + res.raise_for_status() + json: dict[str, str] = res.json() + return json + + def prewarm_lfc(self): + self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status() + + def prewarmed(): + json = self.prewarm_lfc_status() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(prewarmed) + + def offload_lfc(self): + url = f"http://localhost:{self.external_port}/lfc/offload" + self.post(url).raise_for_status() + + def offloaded(): + res = self.get(url) + res.raise_for_status() + json = res.json() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(offloaded) + def database_schema(self, database: str): res = self.get( f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 133be5c045..d4a750ad3b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1185,7 +1185,9 @@ class NeonEnv: "broker": {}, "safekeepers": [], "pageservers": [], - "endpoint_storage": {"port": self.port_distributor.get_port()}, + "endpoint_storage": { + "listen_addr": f"127.0.0.1:{self.port_distributor.get_port()}", + }, "generate_local_ssl_certs": self.generate_local_ssl_certs, } diff --git a/test_runner/regress/test_endpoint_storage.py b/test_runner/regress/test_endpoint_storage.py index 04029114ec..1e27ef4b14 100644 --- a/test_runner/regress/test_endpoint_storage.py +++ b/test_runner/regress/test_endpoint_storage.py @@ -4,10 +4,12 @@ import pytest from aiohttp import ClientSession from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import run_only_on_default_postgres from jwcrypto import jwk, jwt @pytest.mark.asyncio +@run_only_on_default_postgres("test doesn't use 
postgres") async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv): """ Inserts, retrieves, and deletes test file using a JWT token @@ -35,7 +37,6 @@ async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv) key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key" headers = {"Authorization": f"Bearer {token}"} log.info(f"cache key url {key}") - log.info(f"token {token}") async with ClientSession(headers=headers) as session: async with session.get(key) as res: diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index dd0ae1921d..82e1e9fcba 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,11 +1,24 @@ import random import threading import time +from enum import Enum import pytest +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import USE_LFC +from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl + + +class LfcQueryMethod(Enum): + COMPUTE_CTL = False + POSTGRES = True + + +PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total" +OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total" +QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL def check_pinned_entries(cur): @@ -19,11 +32,20 @@ def check_pinned_entries(cur): assert n_pinned == 0 +def prom_parse(client: EndpointHttpClient) -> dict[str, float]: + return { + sample.name: sample.value + for family in prom_parse_impl(client.metrics()) + for sample in family.samples + if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL) + } + + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -def test_lfc_prewarm(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): env = neon_simple_env n_records = 1000000 - endpoint = env.endpoints.create_start( branch_name="main", config_lines=[ @@ -34,30 +56,57 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv): "neon.file_cache_prewarm_limit=1000", ], ) - conn = endpoint.connect() - cur = conn.cursor() - cur.execute("create extension neon version '1.6'") - cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") - cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") - cur.execute("select get_local_cache_state()") - lfc_state = cur.fetchall()[0][0] + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("create database lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + status = http_client.prewarm_lfc_status() + assert status["status"] == "not_prewarmed" + assert "error" not in status + http_client.offload_lfc() + assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed" + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0} + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] 
endpoint.stop() endpoint.start() - conn = endpoint.connect() - cur = conn.cursor() - time.sleep(1) # wait until compute_ctl complete downgrade of extension to default version - cur.execute("alter extension neon update to '1.6'") - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + # wait until compute_ctl completes downgrade of extension to default version + time.sleep(1) + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("alter extension neon update to '1.6'") - cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") - lfc_used_pages = cur.fetchall()[0][0] + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + lfc_used_pages = pg_cur.fetchall()[0][0] log.info(f"Used LFC size: {lfc_used_pages}") - cur.execute("select * from get_prewarm_info()") - prewarm_info = cur.fetchall()[0] + pg_cur.execute("select * from get_prewarm_info()") + prewarm_info = pg_cur.fetchall()[0] log.info(f"Prewarm info: {prewarm_info}") - log.info(f"Prewarm progress: {(prewarm_info[1] + prewarm_info[2]) * 100 // prewarm_info[0]}%") + total, prewarmed, skipped, _ = prewarm_info + progress = (prewarmed + skipped) * 100 // total + log.info(f"Prewarm progress: {progress}%") assert lfc_used_pages > 10000 assert ( @@ -66,18 +115,23 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv): and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] ) - cur.execute("select sum(pk) from t") - assert cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 + lfc_cur.execute("select sum(pk) from t") + assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 - check_pinned_entries(cur) + check_pinned_entries(pg_cur) + + desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} + if query is LfcQueryMethod.COMPUTE_CTL: + assert http_client.prewarm_lfc_status() == desired + assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1} @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod): env = neon_simple_env n_records = 10000 n_threads = 4 - endpoint = env.endpoints.create_start( branch_name="main", config_lines=[ @@ -87,40 +141,58 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): "neon.file_cache_prewarm_limit=1000000", ], ) - conn = endpoint.connect() - cur = conn.cursor() - cur.execute("create extension neon version '1.6'") - cur.execute( + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("CREATE DATABASE lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + lfc_cur.execute( "create table accounts(id integer primary key, balance bigint default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" ) - cur.execute(f"insert into accounts(id) values (generate_series(1,{n_records}))") - cur.execute("select get_local_cache_state()") - lfc_state = cur.fetchall()[0][0] + log.info(f"Inserting {n_records} rows") + lfc_cur.execute(f"insert into accounts(id) values 
(generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.offload_lfc() + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] running = True + n_prewarms = 0 def workload(): - conn = endpoint.connect() - cur = conn.cursor() + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() n_transfers = 0 while running: src = random.randint(1, n_records) dst = random.randint(1, n_records) - cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) - cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) + lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) n_transfers += 1 log.info(f"Number of transfers: {n_transfers}") def prewarm(): - conn = endpoint.connect() - cur = conn.cursor() - n_prewarms = 0 + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() while running: - cur.execute("alter system set neon.file_cache_size_limit='1MB'") - cur.execute("select pg_reload_conf()") - cur.execute("alter system set neon.file_cache_size_limit='1GB'") - cur.execute("select pg_reload_conf()") - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + pg_cur.execute("alter system set neon.file_cache_size_limit='1MB'") + pg_cur.execute("select pg_reload_conf()") + pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'") + pg_cur.execute("select pg_reload_conf()") + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + nonlocal n_prewarms n_prewarms += 1 log.info(f"Number of prewarms: {n_prewarms}") @@ -140,8 +212,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): t.join() prewarm_thread.join() - cur.execute("select sum(balance) from accounts") - total_balance = cur.fetchall()[0][0] + lfc_cur.execute("select sum(balance) from accounts") + total_balance = lfc_cur.fetchall()[0][0] assert total_balance == 0 - check_pinned_entries(cur) + check_pinned_entries(pg_cur) + if query is LfcQueryMethod.COMPUTE_CTL: + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms} From 0ef6851219c58d52263fd96cb87c18e72a928fd2 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 May 2025 17:19:15 -0500 Subject: [PATCH 052/142] Make the audience claim in compute JWTs a vector (#11845) According to RFC 7519, `aud` is generally an array of StringOrURI, but in special cases may be a single StringOrURI value. To accomodate future control plane work where a single token may work for multiple services, make the claim a vector. 
Link: https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3 Signed-off-by: Tristan Partin --- compute_tools/src/http/middleware/authorize.rs | 2 +- control_plane/src/endpoint.rs | 4 ++-- libs/compute_api/src/requests.rs | 7 +++++-- test_runner/regress/test_compute_http.py | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2afc57ad9c..a82f46e062 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -79,7 +79,7 @@ impl AsyncAuthorizeRequest for Authorize { )); }; - if audience != COMPUTE_AUDIENCE { + if !audience.iter().any(|a| a == COMPUTE_AUDIENCE) { return Err(JsonResponse::error( StatusCode::UNAUTHORIZED, "invalid audience in authorization token claims", diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index fe6a93eb5e..be73661a3c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -635,8 +635,8 @@ impl Endpoint { pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { audience: match scope { - Some(ComputeClaimsScope::Admin) => Some(COMPUTE_AUDIENCE.to_owned()), - _ => Some(self.endpoint_id.clone()), + Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]), + _ => None, }, compute_id: match scope { Some(ComputeClaimsScope::Admin) => None, diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 40d34eccea..bbab271474 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -10,9 +10,9 @@ use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; /// The value to place in the [`ComputeClaims::audience`] claim. pub static COMPUTE_AUDIENCE: &str = "compute"; +/// Available scopes for a compute's JWT. #[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] #[serde(rename_all = "snake_case")] -/// Available scopes for a compute's JWT. pub enum ComputeClaimsScope { /// An admin-scoped token allows access to all of `compute_ctl`'s authorized /// facilities. @@ -48,8 +48,11 @@ pub struct ComputeClaims { /// /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for /// more information. + /// + /// TODO: Remove the [`Option`] wrapper when control plane learns to send + /// the claim. #[serde(rename = "aud")] - pub audience: Option, + pub audience: Option>, } /// Request of the /configure API diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py index ce31ff0fe6..9846d44ce2 100644 --- a/test_runner/regress/test_compute_http.py +++ b/test_runner/regress/test_compute_http.py @@ -56,9 +56,9 @@ def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | Non endpoint = env.endpoints.create_start("main") - data = {"scope": str(ComputeClaimsScope.ADMIN)} + data: dict[str, str | list[str]] = {"scope": str(ComputeClaimsScope.ADMIN)} if audience: - data["aud"] = audience + data["aud"] = [audience] token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") From 608afc3055fe1acb32caba2f7b94e01db30f12f2 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 7 May 2025 17:21:17 +0800 Subject: [PATCH 053/142] fix(scrubber): log download error (#11833) ## Problem We use `head_object` to determine whether an object exists or not. However, it does not always error due to a missing object. 
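To illustrate why surfacing the error matters, a small standalone sketch (hypothetical layer name and `std::io::Error` values, not the scrubber's real types): the same `Err` can come from a genuinely missing key or from a transient storage failure, and only the error text tells the two apart.

```rust
fn missing_layer_msg(layer: &str, head_result: Result<(), std::io::Error>) -> Option<String> {
    match head_result {
        Ok(()) => None,
        // Carry the underlying error into the report instead of discarding it.
        Err(e) => Some(format!(
            "layer {layer} is not present in remote storage with error: {e}"
        )),
    }
}

fn main() {
    use std::io::{Error, ErrorKind};
    let not_found = Error::new(ErrorKind::NotFound, "404 Not Found");
    let timed_out = Error::new(ErrorKind::TimedOut, "request timed out");
    // The two reports now differ, so a flaky HEAD is no longer misread as a missing layer.
    println!("{}", missing_layer_msg("some-layer", Err(not_found)).unwrap());
    println!("{}", missing_layer_msg("some-layer", Err(timed_out)).unwrap());
}
```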
## Summary of changes Log the error so that we can have a better idea what's going on with the scrubber errors in prod. --------- Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 5 +++-- storage_scrubber/src/pageserver_physical_gc.rs | 11 +++++++---- storage_scrubber/src/scan_pageserver_metadata.rs | 5 ++++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f0ba632fd4..b151b612bf 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -165,16 +165,17 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if response.is_err() { + if let Err(e) = response { // Object is not present. let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", layer, metadata.generation.get_suffix(), metadata.shard, is_l0, + e, ); if is_l0 || ignore_error { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index f14341c7bc..e1a4095a3c 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -137,11 +137,10 @@ struct TenantRefAccumulator { impl TenantRefAccumulator { fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { let this_shard_idx = ttid.tenant_shard_id.to_index(); - (*self - .shards_seen + self.shards_seen .entry(ttid.tenant_shard_id.tenant_id) - .or_default()) - .insert(this_shard_idx); + .or_default() + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -767,10 +766,13 @@ pub async fn pageserver_physical_gc( stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, ); Ok(try_stream! { + let mut cnt = 0; while let Some(ttid_res) = timelines.next().await { let ttid = ttid_res?; + cnt += 1; yield (ttid, tenant_manifest_arc.clone()); } + tracing::info!(%tenant_shard_id, "Found {} timelines", cnt); }) } }); @@ -790,6 +792,7 @@ pub async fn pageserver_physical_gc( &accumulator, tenant_manifest_arc, ) + .instrument(info_span!("gc_timeline", %ttid)) }); let timelines = timelines.try_buffered(CONCURRENCY); let mut timelines = std::pin::pin!(timelines); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index ba75f25984..77c7987aa7 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -153,7 +153,10 @@ pub async fn scan_pageserver_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); + let timelines = tenants.map_ok(|t| { + tracing::info!("Found tenant: {}", t); + stream_tenant_timelines(&remote_client, &target, t) + }); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); From 3cf5e1386c3071994281b4bc7a1579e4595689f6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 7 May 2025 11:13:26 +0100 Subject: [PATCH 054/142] pageserver: fix rough edges of pageserver tracing (#11842) ## Problem There's a few rough edges around PS tracing. 
## Summary of changes * include compute request id in pageserver trace * use the get page specific context for GET_REL_SIZE and GET_BATCH * fix assertion in download layer trace ![image](https://github.com/user-attachments/assets/2ff6779c-7c2d-4102-8013-ada8203aa42f) --- pageserver/src/page_service.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 64 +++++++++++--------- pageserver/src/tenant/storage_layer/layer.rs | 35 +++++------ 3 files changed, 55 insertions(+), 50 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0ce1a99681..bca1cb5b49 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1038,21 +1038,23 @@ impl PageServerHandler { tracing::info_span!( parent: &parent_span, "handle_get_page_request", + request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, - not_modified_since_lsn = %req.hdr.not_modified_since + not_modified_since_lsn = %req.hdr.not_modified_since, ) }}; ($shard_id:expr) => {{ tracing::info_span!( parent: &parent_span, "handle_get_page_request", + request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since, - shard_id = %$shard_id + shard_id = %$shard_id, ) }}; } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ccb48d8bc1..d770946580 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::{PerfInstrumentFutureExt, RequestContext}; +use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, @@ -275,24 +275,30 @@ impl Timeline { continue; } - let nblocks = match self - .get_rel_size(*tag, Version::Lsn(lsn), &ctx) - .maybe_perf_instrument(&ctx, |crnt_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: crnt_perf_span, - "GET_REL_SIZE", - reltag=%tag, - lsn=%lsn, - ) - }) - .await - { - Ok(nblocks) => nblocks, - Err(err) => { - result_slots[response_slot_idx].write(Err(err)); - slots_filled += 1; - continue; + let nblocks = { + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_REL_SIZE", + reltag=%tag, + lsn=%lsn, + ) + }) + .attached_child(); + + match self + .get_rel_size(*tag, Version::Lsn(lsn), &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await + { + Ok(nblocks) => nblocks, + Err(err) => { + result_slots[response_slot_idx].write(Err(err)); + slots_filled += 1; + continue; + } } }; @@ -308,6 +314,17 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_BATCH", + batch_size = %page_count, + ) + }) + .attached_child(); + let key_slots = keys_slots.entry(key).or_default(); key_slots.push((response_slot_idx, ctx)); @@ -323,14 +340,7 @@ impl Timeline { let query = VersionedKeySpaceQuery::scattered(query); let res = self .get_vectored(query, io_concurrency, ctx) - .maybe_perf_instrument(ctx, |current_perf_span| { - 
info_span!( - target: PERF_TRACE_TARGET, - parent: current_perf_span, - "GET_BATCH", - batch_size = %page_count, - ) - }) + .maybe_perf_instrument(ctx, |current_perf_span| current_perf_span.clone()) .await; match res { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 50810cb154..3d55972017 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -23,7 +23,7 @@ use super::{ LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState, }; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::context::{RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -1076,24 +1076,17 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } - let ctx = if ctx.has_perf_span() { - let dl_ctx = RequestContextBuilder::from(ctx) - .task_kind(TaskKind::LayerDownload) - .download_behavior(DownloadBehavior::Download) - .root_perf_span(|| { - info_span!( - target: PERF_TRACE_TARGET, - "DOWNLOAD_LAYER", - layer = %self, - reason = %reason - ) - }) - .detached_child(); - ctx.perf_follows_from(&dl_ctx); - dl_ctx - } else { - ctx.attached_child() - }; + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "DOWNLOAD_LAYER", + layer = %self, + reason = %reason, + ) + }) + .attached_child(); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1101,7 +1094,7 @@ impl LayerInner { let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self .download_init_and_wait(timeline, permit, ctx.attached_child()) - .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); @@ -1709,7 +1702,7 @@ impl DownloadError { } } -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Copy, Clone)] pub(crate) enum NeedsDownload { NotFound, NotFile(std::fs::FileType), From 0691b73f53580687f0a5aee8b1e3a3192faa7707 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 7 May 2025 14:14:24 +0200 Subject: [PATCH 055/142] fix(compute): Enforce cloud_admin role in compute_ctl connections (#11827) ## Problem Users can override some configuration parameters on the DB level with `ALTER DATABASE ... SET ...`. Some of these overrides, like `role` or `default_transaction_read_only`, affect `compute_ctl`'s ability to configure the DB schema properly. ## Summary of changes Enforce `role=cloud_admin`, `statement_timeout=0`, and move `default_transaction_read_only=off` override from control plane [1] to `compute_ctl`. Also, enforce `search_path=public` just in case, although we do not call any functions in user databases. 
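As a rough illustration of the mechanism (a standalone sketch against the `postgres` crate that assumes a reachable server with a `cloud_admin` role and a `neondb` database; it is not the actual `compute_ctl` wiring): settings passed in the startup packet via `options` take precedence over per-database `ALTER DATABASE ... SET ...` defaults, so the maintenance session always starts with a known environment.

```rust
use postgres::{Config, NoTls};

fn main() -> Result<(), postgres::Error> {
    let mut conf: Config = "host=localhost user=cloud_admin dbname=neondb".parse()?;

    // Preserve any options already present, then force the settings we rely on.
    let extra = "-c role=cloud_admin -c default_transaction_read_only=off \
                 -c search_path=public -c statement_timeout=0";
    let options = match conf.get_options() {
        Some(existing) => format!("{existing} {extra}"),
        None => extra.to_owned(),
    };
    conf.options(&options);

    let mut client = conf.connect(NoTls)?;
    // Even if the user ran `ALTER DATABASE neondb SET default_transaction_read_only = on`,
    // this session sees the value forced above.
    let row = client.query_one("SHOW default_transaction_read_only", &[])?;
    assert_eq!(row.get::<_, String>(0), "off");
    Ok(())
}
```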
[1]: https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70 Fixes https://github.com/neondatabase/cloud/issues/28532 --- compute_tools/src/compute.rs | 45 +++++++++++--- test_runner/regress/test_compute_catalog.py | 66 +++++++++++++++++++++ 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 08d915b331..0cda36a6e2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -329,11 +329,39 @@ struct StartVmMonitorResult { impl ComputeNode { pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); - let conn_conf = postgres::config::Config::from_str(connstr) + let mut conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + let mut tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) .context("cannot build tokio postgres config from connstr")?; + // Users can set some configuration parameters per database with + // ALTER DATABASE ... SET ... + // + // There are at least these parameters: + // + // - role=some_other_role + // - default_transaction_read_only=on + // - statement_timeout=1, i.e., 1ms, which will cause most of the queries to fail + // - search_path=non_public_schema, this should be actually safe because + // we don't call any functions in user databases, but better to always reset + // it to public. + // + // that can affect `compute_ctl` and prevent it from properly configuring the database schema. + // Unset them via connection string options before connecting to the database. + // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. + // + // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane + // as well. After rolling out this code, we can remove this parameter from control plane. + // In the meantime, double-passing is fine, the last value is applied. + // See: + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + let options = match conn_conf.get_options() { + Some(options) => format!("{} {}", options, EXTRA_OPTIONS), + None => EXTRA_OPTIONS.to_string(), + }; + conn_conf.options(&options); + tokio_conn_conf.options(&options); + let mut new_state = ComputeState::new(); if let Some(spec) = config.spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; @@ -1449,15 +1477,20 @@ impl ComputeNode { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // Connect with zenith_admin if cloud_admin could not authenticate + // Connect with `zenith_admin` if `cloud_admin` could not authenticate info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", + "cannot connect to Postgres: {}, retrying with 'zenith_admin' username", e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); + // It doesn't matter what were the options before, here we just want + // to connect and create a new superuser role. 
+ const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + zenith_admin_conf.options(ZENITH_OPTIONS); + let mut client = zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; @@ -1623,9 +1656,7 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = - tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); + let conf = self.get_tokio_conn_conf(Some("compute_ctl:reconfigure")); let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 37208c9fff..b66b326360 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -544,3 +544,69 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env ) role = cursor.fetchone() assert role is None + + +def test_db_with_custom_settings(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can work with databases that have some custom settings. + For example, role=some_other_role, default_transaction_read_only=on, + search_path=non_public_schema, statement_timeout=1 (1ms). + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + TEST_ROLE = "some_other_role" + TEST_DB = "db_with_custom_settings" + TEST_SCHEMA = "non_public_schema" + + endpoint.respec_deep( + **{ + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": [ + { + "name": TEST_DB, + "owner": TEST_ROLE, + } + ], + "roles": [ + { + "name": TEST_ROLE, + } + ], + }, + } + } + ) + + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute(f"CREATE SCHEMA {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET role = {TEST_ROLE}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET default_transaction_read_only = on") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET search_path = {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET statement_timeout = 1") + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute("SELECT current_role") + role = cursor.fetchone() + assert role is not None + assert role[0] == TEST_ROLE + + cursor.execute("SHOW default_transaction_read_only") + default_transaction_read_only = cursor.fetchone() + assert default_transaction_read_only is not None + assert default_transaction_read_only[0] == "on" + + cursor.execute("SHOW search_path") + search_path = cursor.fetchone() + assert search_path is not None + assert search_path[0] == TEST_SCHEMA + + # Do not check statement_timeout, because we force it to 2min + # in `endpoint.cursor()` fixture. + + endpoint.reconfigure() From 4d2e4b19c3dc8816668abc4204b110f1c9fd1b1e Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Wed, 7 May 2025 18:34:08 +0200 Subject: [PATCH 056/142] fix(compute) Correct the PGXN s3 gateway URL. (#11796) Corrects the postgres extension s3 gateway address to be not just a domain name but a full base URL. To make the code more readable, the option is renamed to "remote_ext_base_url", while keeping the old name also accessible by providing a clap argument alias. Also provides a very simple and, perhaps, even redundant unit test to confirm the logic behind parsing of the corresponding CLI argument. 
## Problem As it is clearly stated in https://github.com/neondatabase/cloud/issues/26005, using of the short version of the domain name might work for now, but in the future, we should get rid of using the `default` namespace and this is where it will, most likely, break down. ## Summary of changes The changes adjust the domain name of the extension s3 gateway to use the proper base url format instead of the just domain name assuming the "default" namespace and add a new CLI argument name for to reflect the change and the expectance. --- compute_tools/src/bin/compute_ctl.rs | 34 +++++++++++++++---- compute_tools/src/compute.rs | 10 +++--- compute_tools/src/extension_server.rs | 8 ++--- .../src/http/routes/extension_server.rs | 2 +- control_plane/src/bin/neon_local.rs | 9 ++--- control_plane/src/endpoint.rs | 6 ++-- test_runner/fixtures/neon_cli.py | 6 ++-- test_runner/fixtures/neon_fixtures.py | 12 +++---- .../regress/test_download_extensions.py | 4 +-- 9 files changed, 56 insertions(+), 35 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e337ee7b15..20b5e567a8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,12 +60,16 @@ use utils::failpoint_support; // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. // Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_config(arg: &str) -> Result { - if arg.starts_with("http") { - Ok(arg.trim_end_matches('/').to_string()) +fn parse_remote_ext_base_url(arg: &str) -> Result { + const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str = + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"; + + Ok(if arg.starts_with("http") { + arg } else { - Ok("http://pg-ext-s3-gateway".to_string()) + FALLBACK_PG_EXT_GATEWAY_BASE_URL } + .to_owned()) } #[derive(Parser)] @@ -74,8 +78,10 @@ struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] - pub remote_ext_config: Option, + /// The base URL for the remote extension storage proxy gateway. + /// Should be in the form of `http(s)://[:]`. + #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")] + pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running /// outside the compute will talk to the compute through this port. 
Keep @@ -164,7 +170,7 @@ fn main() -> Result<()> { pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, internal_http_port: cli.internal_http_port, - ext_remote_storage: cli.remote_ext_config.clone(), + remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, #[cfg(target_os = "linux")] @@ -265,4 +271,18 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn parse_pg_ext_gateway_base_url() { + let arg = "http://pg-ext-s3-gateway2"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!(result, arg); + + let arg = "pg-ext-s3-gateway"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!( + result, + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local" + ); + } } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0cda36a6e2..25920675c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -95,7 +95,7 @@ pub struct ComputeNodeParams { pub internal_http_port: u16, /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + pub remote_ext_base_url: Option, } /// Compute node info shared across several `compute_ctl` threads. @@ -1896,9 +1896,9 @@ LIMIT 100", real_ext_name: String, ext_path: RemotePath, ) -> Result { - let ext_remote_storage = + let remote_ext_base_url = self.params - .ext_remote_storage + .remote_ext_base_url .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1960,7 +1960,7 @@ LIMIT 100", let download_size = extension_server::download_extension( &real_ext_name, &ext_path, - ext_remote_storage, + remote_ext_base_url, &self.params.pgbin, ) .await @@ -2069,7 +2069,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.params.ext_remote_storage.is_none() { + if self.params.remote_ext_base_url.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index ee889e0c40..3439383699 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - ext_remote_storage: &str, + remote_ext_base_url: &str, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = - match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { // Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. 
-async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", ext_remote_storage, ext_path); +async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { + let uri = format!("{}/{}", remote_ext_base_url, ext_path); let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 6508de6eee..e141a48b7f 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension( State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.params.ext_remote_storage.is_none() { + if compute.params.remote_ext_base_url.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index fd625e9ed6..610fa5f865 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -644,9 +644,10 @@ struct EndpointStartCmdArgs { #[clap( long, - help = "Configure the remote extensions storage proxy gateway to request for extensions." + help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", + alias = "remote-ext-config" )] - remote_ext_config: Option, + remote_ext_base_url: Option, #[clap( long, @@ -1414,7 +1415,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res EndpointCmd::Start(args) => { let endpoint_id = &args.endpoint_id; let pageserver_id = args.endpoint_pageserver_id; - let remote_ext_config = &args.remote_ext_config; + let remote_ext_base_url = &args.remote_ext_base_url; let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed @@ -1510,7 +1511,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res safekeepers_generation, safekeepers, pageservers, - remote_ext_config.as_ref(), + remote_ext_base_url.as_ref(), stripe_size.0 as usize, args.create_test_user, args.start_timeout, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index be73661a3c..708745446d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -655,7 +655,7 @@ impl Endpoint { safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, - remote_ext_config: Option<&String>, + remote_ext_base_url: Option<&String>, shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, @@ -825,8 +825,8 @@ impl Endpoint { .stderr(logfile.try_clone()?) 
.stdout(logfile); - if let Some(remote_ext_config) = remote_ext_config { - cmd.args(["--remote-ext-config", remote_ext_config]); + if let Some(remote_ext_base_url) = remote_ext_base_url { + cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } let child = cmd.spawn()?; diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 3be78719d7..4eaa4b7d99 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -557,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, @@ -572,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli): extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) - if remote_ext_config is not None: - args.extend(["--remote-ext-config", remote_ext_config]) + if remote_ext_base_url is not None: + args.extend(["--remote-ext-base-url", remote_ext_base_url]) if safekeepers_generation is not None: args.extend(["--safekeepers-generation", str(safekeepers_generation)]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d4a750ad3b..85ad49bb4f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4226,7 +4226,7 @@ class Endpoint(PgProtocol, LogUtils): def start( self, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, @@ -4252,7 +4252,7 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, @@ -4467,7 +4467,7 @@ class Endpoint(PgProtocol, LogUtils): hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, @@ -4486,7 +4486,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, ).start( - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, @@ -4570,7 +4570,7 @@ class EndpointFactory: lsn: Lsn | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, basebackup_request_tries: int | None = None, ) -> Endpoint: @@ -4590,7 +4590,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, basebackup_request_tries=basebackup_request_tries, ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d28240c722..24ba0713d2 100644 --- 
a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -221,7 +221,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -249,7 +249,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. with endpoint.connect() as conn: From 24d62c647fba00d1ac93f4118836ceeddf07b270 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 7 May 2025 21:00:41 +0400 Subject: [PATCH 057/142] storcon: add missing switch_timeline_membership method to sk client (#11850) ## Problem `switch_timeline_membership` is implemented on safekeeper's server side, but the is missing in the client. - Part of https://github.com/neondatabase/neon/issues/11823 ## Summary of changes - Add `switch_timeline_membership` method to `SafekeeperClient` --- safekeeper/client/src/mgmt_api.rs | 14 ++++++++++++++ storage_controller/src/safekeeper_client.rs | 17 +++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5849df0343..b364ac8e48 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -121,6 +121,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/membership", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); let resp = self diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 988159af4a..1f3ea96d96 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -98,6 +98,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "switch_timeline_membership", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .switch_timeline_membership(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_tenant( &self, tenant_id: TenantId, From 7eb85c56acb5f87c730b879c9488e217448ee28b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 08:33:29 +0200 Subject: [PATCH 058/142] tokio-epoll-uring: avoid warn! 
noise due to `ECANCELED` during shutdowns (#11819) # Problem Before this PR, `test_pageserver_catchup_while_compute_down` would occasionally fail due to scary-looking WARN log line ``` WARN ephemeral_file_buffered_writer{...}:flush_attempt{attempt=1}: \ error flushing buffered writer buffer to disk, retrying after backoff err=Operation canceled (os error 125) ``` After lengthy investigation, the conclusion is that this is likely due to a kernel bug related due to io_uring async workers (io-wq) and signals. The main indicator is that the error only ever happens in correlation with pageserver shtudown when SIGTERM is received. There is a fix that is merged in 6.14 kernels (`io-wq: backoff when retrying worker creation`). However, even when I revert that patch, the issue is not reproducible on 6.14, so, it remains a speculation. It was ruled out that the ECANCELED is due to the executor thread exiting before the async worker starts processing the operation. # Solution The workaround in this issue is to retry the operation on ECANCELED once. Retries are safe because the low-level io_engine operations are idempotent. (We don't use O_APPEND and I can't think of another flag that would make the APIs covered by this patch not idempotent.) # Testing With this PR, the warn! log no longer happens on [my reproducer setup](https://github.com/neondatabase/neon/issues/11446#issuecomment-2843015111). And the new rate-limited `info!`-level log line informing about the internal retry shows up instead, as expected. # Refs - fixes https://github.com/neondatabase/neon/issues/11446 --- libs/utils/src/rate_limit.rs | 2 +- pageserver/src/virtual_file/io_engine.rs | 85 +++++++++++++++++++-- pageserver/src/virtual_file/open_options.rs | 18 +++-- 3 files changed, 91 insertions(+), 14 deletions(-) diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 945f710b1d..700cd5792b 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats { } impl RateLimit { - pub fn new(interval: Duration) -> Self { + pub const fn new(interval: Duration) -> Self { Self { last: None, interval, diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index dd04fb561a..d8eb803335 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -13,7 +13,7 @@ pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::IoBuf; -use tracing::Instrument; +use tracing::{Instrument, info}; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -111,13 +111,16 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; +use std::time::Duration; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] -fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { +pub(super) fn epoll_uring_error_to_std( + e: tokio_epoll_uring::Error, +) -> std::io::Error { match e { tokio_epoll_uring::Error::Op(e) => e, tokio_epoll_uring::Error::System(system) => { @@ -149,7 +152,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, slice).await; + let (resources, res) = + retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async { + 
system.read(file_guard, offset, slice).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -164,7 +171,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fsync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fsync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -182,7 +192,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fdatasync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fdatasync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -201,7 +214,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.statx(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.statx(file_guard).await + }) + .await; ( resources, res.map_err(epoll_uring_error_to_std).map(Metadata::from), @@ -224,6 +240,7 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { // TODO: ftruncate op for tokio-epoll-uring + // Don't forget to use retry_ecanceled_once let res = file_guard.with_std_file(|std_file| std_file.set_len(len)); (file_guard, res) } @@ -245,8 +262,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let ((file_guard, slice), res) = - system.write(file_guard, offset, buf.into_raw_slice()).await; + let ((file_guard, slice), res) = retry_ecanceled_once( + (file_guard, buf.into_raw_slice()), + async |(file_guard, buf)| system.write(file_guard, offset, buf).await, + ) + .await; ( (file_guard, FullSlice::must_new(slice)), res.map_err(epoll_uring_error_to_std), @@ -282,6 +302,55 @@ impl IoEngine { } } +/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data, +/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED. +/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals. +/// Investigation ticket: +/// +/// This function retries the operation once if it fails with ECANCELED. +/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. 
+pub(super) async fn retry_ecanceled_once( + resources: T, + f: F, +) -> (T, Result>) +where + F: Fn(T) -> Fut, + Fut: std::future::Future>)>, + T: Send, + V: Send, +{ + let (resources, res) = f(resources).await; + let Err(e) = res else { + return (resources, res); + }; + let tokio_epoll_uring::Error::Op(err) = e else { + return (resources, Err(e)); + }; + if err.raw_os_error() != Some(nix::libc::ECANCELED) { + return (resources, Err(tokio_epoll_uring::Error::Op(err))); + } + { + static RATE_LIMIT: std::sync::Mutex = + std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1))); + let mut guard = RATE_LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + info!( + %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited" + ); + }); + drop(guard); + } + tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners + let (resources, res) = f(resources).await; + (resources, res) +} + +pub(super) fn panic_operation_must_be_idempotent() { + panic!( + "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)" + ) +} + pub enum FeatureTestResult { PlatformPreferred(IoEngineKind), Worse { diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 2a7bb693f2..a40dfed4a4 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -110,18 +110,23 @@ impl OpenOptions { self } + /// Don't use, `O_APPEND` is not supported. + pub fn append(&mut self, _append: bool) { + super::io_engine::panic_operation_must_be_idempotent(); + } + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { match &self.inner { Inner::StdFs(x) => x.open(path).map(|file| file.into()), #[cfg(target_os = "linux")] Inner::TokioEpollUring(x) => { let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; - system.open(path, x).await.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { + let res = system.open(path, x).await; + ((), res) }) + .await; + res.map_err(super::io_engine::epoll_uring_error_to_std) } } } @@ -140,6 +145,9 @@ impl OpenOptions { } pub fn custom_flags(mut self, flags: i32) -> Self { + if flags & nix::libc::O_APPEND != 0 { + super::io_engine::panic_operation_must_be_idempotent(); + } match &mut self.inner { Inner::StdFs(x) => { let _ = x.custom_flags(flags); From 1d1502bc167a2d0372756650581b4666597120c8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 08:57:53 +0200 Subject: [PATCH 059/142] fix(pageserver): `flush task cancelled` errors during timeline shutdown (#11853) # Refs - fixes https://github.com/neondatabase/neon/issues/11762 # Problem PR #10993 introduced internal retries for BufferedWriter flushes. PR #11052 added cancellation sensitivity to that retry loop. That cancellation sensitivity is an error path that didn't exist before. The result is that during timeline shutdown, after we `Timeline::cancel`, compaction can now fail with error `flush task cancelled`. The problem with that: 1. We mis-classify this as an `error!`-worthy event. 2. 
This causes tests to become flaky because the error is not in global `allowed_errors`. Technically we also trip the `compaction_circuit_breaker` because the resulting `CompactionError` is variant `::Other`. But since this is Timeline shutdown, is doesn't matter practically speaking. # Solution / Changes - Log the anyhow stack trace when classifying a compaction error as `error!`. This was helpful to identify sources of `flush task cancelled` errors. We only log at `error!` level in exceptional circumstances, so, it's ok to have bit verbose logs. - Introduce typed errors along the `BufferedWriter::write_*`=> `BlobWriter::write_blob` => `{Delta,Image}LayerWriter::put_*` => `Split{Delta,Image}LayerWriter::put_{value,image}` chain. - Proper mapping to `CompactionError`/`CreateImageLayersError` via new `From` impls. I am usually opposed to any magic `From` impls, but, it's how most of the compaction code works today. # Testing The symptoms are most prevalent in `test_runner/regress/test_branch_and_gc.py::test_branch_and_gc`. Before this PR, I was able to reproduce locally 1 or 2 times per 400 runs using `DEFAULT_PG_VERSION=15 BUILD_TYPE=release poetry run pytest --count 400 -n 8`. After this PR, it doesn't reproduce anymore after 2000 runs. # Future Work Technically the ingest path is also exposed to this new source of errors because `InMemoryLayer` is backed by `BufferedWriter`. But we haven't seen it occur in flaky tests yet. Details and a fix in - https://github.com/neondatabase/neon/pull/11851 --- pageserver/src/tenant/blob_io.rs | 27 ++++++++++++++----- pageserver/src/tenant/storage_layer.rs | 1 + .../storage_layer/batch_split_writer.rs | 18 ++++++++----- .../src/tenant/storage_layer/delta_layer.rs | 25 +++++++++++------ pageserver/src/tenant/storage_layer/errors.rs | 24 +++++++++++++++++ .../src/tenant/storage_layer/image_layer.rs | 20 ++++++++++---- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 20 ++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 3 +-- .../owned_buffers_io/write/flush.rs | 13 +++++++++ 10 files changed, 124 insertions(+), 29 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/errors.rs diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8cf3c548c9..ed541c4f12 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -94,10 +94,23 @@ impl Header { pub enum WriteBlobError { #[error(transparent)] Flush(FlushTaskError), - #[error("blob too large ({len} bytes)")] - BlobTooLarge { len: usize }, #[error(transparent)] - WriteBlobRaw(anyhow::Error), + Other(anyhow::Error), +} + +impl WriteBlobError { + pub fn is_cancel(&self) -> bool { + match self { + WriteBlobError::Flush(e) => e.is_cancel(), + WriteBlobError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + WriteBlobError::Flush(e) => e.into_anyhow(), + WriteBlobError::Other(e) => e, + } + } } impl BlockCursor<'_> { @@ -327,7 +340,9 @@ where return ( ( io_buf.slice_len(), - Err(WriteBlobError::BlobTooLarge { len }), + Err(WriteBlobError::Other(anyhow::anyhow!( + "blob too large ({len} bytes)" + ))), ), srcbuf, ); @@ -391,7 +406,7 @@ where // Verify the header, to ensure we don't write invalid/corrupt data. 
let header = match Header::decode(&raw_with_header) .context("decoding blob header") - .map_err(WriteBlobError::WriteBlobRaw) + .map_err(WriteBlobError::Other) { Ok(header) => header, Err(err) => return (raw_with_header, Err(err)), @@ -401,7 +416,7 @@ where let raw_len = raw_with_header.len(); return ( raw_with_header, - Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!( + Err(WriteBlobError::Other(anyhow::anyhow!( "header length mismatch: {header_total_len} != {raw_len}" ))), ); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 796ad01e54..5dfa961b71 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,6 +2,7 @@ pub mod batch_split_writer; pub mod delta_layer; +pub mod errors; pub mod filter_iterator; pub mod image_layer; pub mod inmemory_layer; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 39cd02d101..51f2e909a2 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -10,6 +10,7 @@ use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; +use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, @@ -235,7 +236,7 @@ impl<'a> SplitImageLayerWriter<'a> { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. @@ -253,7 +254,8 @@ impl<'a> SplitImageLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, @@ -346,7 +348,7 @@ impl<'a> SplitDeltaLayerWriter<'a> { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // @@ -366,7 +368,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?, + .await + .map_err(PutError::Other)?, )); } let (_, inner) = self.inner.as_mut().unwrap(); @@ -386,7 +389,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let (start_key, prev_delta_writer) = self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( @@ -396,11 +400,11 @@ impl<'a> SplitDeltaLayerWriter<'a> { ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. 
- anyhow::bail!( + return Err(PutError::Other(anyhow::anyhow!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, inner.estimated_size() - ); + ))); } } self.last_key_written = key; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 11875ac653..2c1b27c8d5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -55,6 +55,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, @@ -477,12 +478,15 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { let (_, res) = self .put_value_bytes( key, lsn, - Value::ser(&val)?.slice_len(), + Value::ser(&val) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)? + .slice_len(), val.will_init(), ctx, ) @@ -497,7 +501,7 @@ impl DeltaLayerWriterInner { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -513,19 +517,24 @@ impl DeltaLayerWriterInner { .blob_writer .write_blob_maybe_compressed(val, ctx, compression) .await; + let res = res.map_err(PutError::WriteBlob); let off = match res { Ok((off, _)) => off, - Err(e) => return (val, Err(anyhow::anyhow!(e))), + Err(e) => return (val, Err(e)), }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - let res = self.tree.append(&delta_key.0, blob_ref.0); + let res = self + .tree + .append(&delta_key.0, blob_ref.0) + .map_err(anyhow::Error::new) + .map_err(PutError::Other); self.num_keys += 1; - (val, res.map_err(|e| anyhow::anyhow!(e))) + (val, res) } fn size(&self) -> u64 { @@ -694,7 +703,7 @@ impl DeltaLayerWriter { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner .as_mut() .unwrap() @@ -709,7 +718,7 @@ impl DeltaLayerWriter { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { diff --git a/pageserver/src/tenant/storage_layer/errors.rs b/pageserver/src/tenant/storage_layer/errors.rs new file mode 100644 index 0000000000..591e489faa --- /dev/null +++ b/pageserver/src/tenant/storage_layer/errors.rs @@ -0,0 +1,24 @@ +use crate::tenant::blob_io::WriteBlobError; + +#[derive(Debug, thiserror::Error)] +pub enum PutError { + #[error(transparent)] + WriteBlob(WriteBlobError), + #[error(transparent)] + Other(anyhow::Error), +} + +impl PutError { + pub fn is_cancel(&self) -> bool { + match self { + PutError::WriteBlob(e) => e.is_cancel(), + PutError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + PutError::WriteBlob(e) => e.into_anyhow(), + PutError::Other(e) => e, + } + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d684230572..740f53f928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,6 +53,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; 
use super::layer_name::ImageLayerName; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, @@ -842,8 +843,14 @@ impl ImageLayerWriterInner { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - ensure!(self.key_range.contains(&key)); + ) -> Result<(), PutError> { + if !self.key_range.contains(&key) { + return Err(PutError::Other(anyhow::anyhow!( + "key {:?} not in range {:?}", + key, + self.key_range + ))); + } let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; @@ -853,7 +860,7 @@ impl ImageLayerWriterInner { .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let (off, compression_info) = res?; + let (off, compression_info) = res.map_err(PutError::WriteBlob)?; if compression_info.compressed_size.is_some() { // The image has been considered for compression at least self.uncompressed_bytes_eligible += uncompressed_len; @@ -865,7 +872,10 @@ impl ImageLayerWriterInner { let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - self.tree.append(&keybuf, off)?; + self.tree + .append(&keybuf, off) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)?; #[cfg(feature = "testing")] { @@ -1085,7 +1095,7 @@ impl ImageLayerWriter { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1112a5330b..4709a6d616 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -340,7 +340,7 @@ pub(crate) fn log_compaction_error( } else { match level { Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), - Level::ERROR => error!("Compaction failed: {err:#}"), + Level::ERROR => error!("Compaction failed: {err:?}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cfeab77598..c8d897d074 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -987,6 +987,16 @@ impl From for CreateImageLayersError { } } +impl From for CreateImageLayersError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CreateImageLayersError::Cancelled + } else { + CreateImageLayersError::Other(e.into_anyhow()) + } + } +} + impl From for CreateImageLayersError { fn from(e: GetVectoredError) -> Self { match e { @@ -5923,6 +5933,16 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CompactionError::ShuttingDown + } else { + CompactionError::Other(e.into_anyhow()) + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d0c13d86ce..07cd274a41 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2204,8 +2204,7 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await - .map_err(CompactionError::Other)?; + .await?; } else { let owner = 
self.shard_identity.get_shard_number(&key); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index b41a9f6cd2..ac9867e8b4 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -247,6 +247,19 @@ pub enum FlushTaskError { Cancelled, } +impl FlushTaskError { + pub fn is_cancel(&self) -> bool { + match self { + FlushTaskError::Cancelled => true, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + FlushTaskError::Cancelled => anyhow::anyhow!(self), + } + } +} + impl FlushBackgroundTask where Buf: IoBufAligned + Send + Sync, From 40f32ea326ac9f8b691f179d0ced414470eb06ff Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 8 May 2025 10:19:14 +0100 Subject: [PATCH 060/142] pageserver: refactor import flow and add job concurrency limiting (#11816) ## Problem Import code is one big block. Separating planning and execution will help with reporting progress of import to storcon (building block for resuming import). ## Summary of changes Split up the import into planning and execution. A concurrency limit driven by PS config is also added. --- libs/pageserver_api/src/config.rs | 11 + pageserver/src/config.rs | 4 + .../src/tenant/timeline/import_pgdata.rs | 9 +- .../src/tenant/timeline/import_pgdata/flow.rs | 195 ++++++++++-------- 4 files changed, 129 insertions(+), 90 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b64c42a808..5b0c13dd89 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -182,6 +182,7 @@ pub struct ConfigToml { pub tracing: Option, pub enable_tls_page_service_api: bool, pub dev_mode: bool, + pub timeline_import_config: TimelineImportConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -300,6 +301,12 @@ impl From for tracing_utils::Protocol { } } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct TimelineImportConfig { + pub import_job_concurrency: NonZeroUsize, + pub import_job_soft_size_limit: NonZeroUsize, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -659,6 +666,10 @@ impl Default for ConfigToml { tracing: None, enable_tls_page_service_api: false, dev_mode: false, + timeline_import_config: TimelineImportConfig { + import_job_concurrency: NonZeroUsize::new(128).unwrap(), + import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + }, } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ded2805602..7e773f56b3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -230,6 +230,8 @@ pub struct PageServerConf { /// such as authentication requirements for HTTP and PostgreSQL APIs. /// This is insecure and should only be used in development environments. 
pub dev_mode: bool, + + pub timeline_import_config: pageserver_api::config::TimelineImportConfig, } /// Token for authentication to safekeepers @@ -404,6 +406,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, } = config_toml; let mut conf = PageServerConf { @@ -457,6 +460,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6ab6b90cb6..c4a8df39a3 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -149,14 +149,7 @@ pub async fn doit( } .await?; - flow::run( - timeline.clone(), - base_lsn, - control_file, - storage.clone(), - ctx, - ) - .await?; + flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; // // Communicate that shard is done. diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c6d2944769..34c073365d 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -34,7 +34,9 @@ use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; +use futures::stream::FuturesOrdered; use itertools::Itertools; +use pageserver_api::config::TimelineImportConfig; use pageserver_api::key::{ CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -46,8 +48,9 @@ use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; use postgres_ffi::{BLCKSZ, pg_constants}; use remote_storage::RemotePath; -use tokio::task::JoinSet; -use tracing::{Instrument, debug, info_span, instrument}; +use tokio::sync::Semaphore; +use tokio_stream::StreamExt; +use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; @@ -63,37 +66,39 @@ use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, storage: RemoteStorageWrapper, ctx: &RequestContext, ) -> anyhow::Result<()> { - Flow { - timeline, - pgdata_lsn, + let planner = Planner { control_file, - tasks: Vec::new(), - storage, - } - .run(ctx) - .await + storage: storage.clone(), + shard: timeline.shard_identity, + tasks: Vec::default(), + }; + + let import_config = &timeline.conf.timeline_import_config; + let plan = planner.plan(import_config).await?; + plan.execute(timeline, import_config, ctx).await } -struct Flow { - timeline: Arc, - pgdata_lsn: Lsn, +struct Planner { control_file: ControlFile, - tasks: Vec, storage: RemoteStorageWrapper, + shard: ShardIdentity, + tasks: Vec, } -impl Flow { - /// Perform the ingestion into [`Self::timeline`]. - /// Assumes the timeline is empty (= no layers). - pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); +struct Plan { + jobs: Vec, +} - self.pgdata_lsn = pgdata_lsn; +impl Planner { + /// Creates an import plan + /// + /// This function is and must remain pure: given the same input, it will generate the same import plan. 
+ async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); let datadir = PgDataDir::new(&self.storage).await?; @@ -115,7 +120,7 @@ impl Flow { } // Import SLRUs - if self.timeline.tenant_shard_id.is_shard_zero() { + if self.shard.is_shard_zero() { // pg_xact (01:00 keyspace) self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) .await?; @@ -166,14 +171,16 @@ impl Flow { let mut last_end_key = Key::MIN; let mut current_chunk = Vec::new(); let mut current_chunk_size: usize = 0; - let mut parallel_jobs = Vec::new(); + let mut jobs = Vec::new(); for task in std::mem::take(&mut self.tasks).into_iter() { - if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + if current_chunk_size + task.total_size() + > import_config.import_job_soft_size_limit.into() + { let key_range = last_end_key..task.key_range().start; - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( key_range.clone(), std::mem::take(&mut current_chunk), - &self, + pgdata_lsn, )); last_end_key = key_range.end; current_chunk_size = 0; @@ -181,45 +188,13 @@ impl Flow { current_chunk_size += task.total_size(); current_chunk.push(task); } - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( last_end_key..Key::MAX, current_chunk, - &self, + pgdata_lsn, )); - // Start all jobs simultaneosly - let mut work = JoinSet::new(); - // TODO: semaphore? - for job in parallel_jobs { - let ctx: RequestContext = - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); - work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); - } - let mut results = Vec::new(); - while let Some(result) = work.join_next().await { - match result { - Ok(res) => { - results.push(res); - } - Err(_joinset_err) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); - } - } - } - - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) = result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(Plan { jobs }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -266,7 +241,7 @@ impl Flow { let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); self.tasks .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( - *self.timeline.get_shard_identity(), + self.shard, start_key..end_key, &file.path, self.storage.clone(), @@ -289,7 +264,7 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { - assert!(self.timeline.tenant_shard_id.is_shard_zero()); + assert!(self.shard.is_shard_zero()); let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments @@ -344,6 +319,68 @@ impl Flow { } } +impl Plan { + async fn execute( + self, + timeline: Arc, + import_config: &TimelineImportConfig, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut work = FuturesOrdered::new(); + let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); + + let jobs_in_plan = self.jobs.len(); + + let mut jobs = self.jobs.into_iter().enumerate().peekable(); + let mut results = Vec::new(); + + // Run import jobs concurrently up to the limit specified by the 
pageserver configuration. + // Note that we process completed futures in the oreder of insertion. This will be the + // building block for resuming imports across pageserver restarts or tenant migrations. + while results.len() < jobs_in_plan { + tokio::select! { + permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { + let permit = permit.expect("never closed"); + let (job_idx, job) = jobs.next().expect("we peeked"); + let job_timeline = timeline.clone(); + let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + + work.push_back(tokio::task::spawn(async move { + let _permit = permit; + let res = job.run(job_timeline, &ctx).await; + (job_idx, res) + })); + }, + maybe_complete_job_idx = work.next() => { + match maybe_complete_job_idx { + Some(Ok((_job_idx, res))) => { + results.push(res); + }, + Some(Err(_)) => { + results.push(Err(anyhow::anyhow!( + "parallel job panicked or cancelled, check pageserver logs" + ))); + } + None => {} + } + } + } + } + + if results.iter().all(|r| r.is_ok()) { + Ok(()) + } else { + let mut msg = String::new(); + for result in results { + if let Err(err) = result { + msg.push_str(&format!("{err:?}\n\n")); + } + } + bail!("Some parallel jobs failed:\n\n{msg}"); + } + } +} + // // dbdir iteration tools // @@ -713,7 +750,6 @@ impl From for AnyImportTask { } struct ChunkProcessingJob { - timeline: Arc, range: Range, tasks: Vec, @@ -721,25 +757,24 @@ struct ChunkProcessingJob { } impl ChunkProcessingJob { - fn new(range: Range, tasks: Vec, env: &Flow) -> Self { - assert!(env.pgdata_lsn.is_valid()); + fn new(range: Range, tasks: Vec, pgdata_lsn: Lsn) -> Self { + assert!(pgdata_lsn.is_valid()); Self { - timeline: env.timeline.clone(), range, tasks, - pgdata_lsn: env.pgdata_lsn, + pgdata_lsn, } } - async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = ImageLayerWriter::new( - self.timeline.conf, - self.timeline.timeline_id, - self.timeline.tenant_shard_id, + timeline.conf, + timeline.timeline_id, + timeline.tenant_shard_id, &self.range, self.pgdata_lsn, - &self.timeline.gate, - self.timeline.cancel.clone(), + &timeline.gate, + timeline.cancel.clone(), ctx, ) .await?; @@ -751,24 +786,20 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; // this is sharing the same code as create_image_layers - let mut guard = self.timeline.layers.write().await; + let mut guard = timeline.layers.write().await; guard .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); crate::tenant::timeline::drop_wlock(guard); - // Schedule the layer for upload but don't add barriers such as - // wait for completion or index upload, so we don't inhibit upload parallelism. - // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) - // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. 
- self.timeline + timeline .remote_client .schedule_layer_file_upload(resident_layer)?; From 7e55497e131f2f26a16ae22bff80cac11951cdd4 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 8 May 2025 14:00:45 +0400 Subject: [PATCH 061/142] tests: flush wal before waiting for last record lsn (#11726) ## Problem Compute may flush WAL on page boundaries, leaving some records partially flushed for a long time. It leads to `wait_for_last_flush_lsn` stuck waiting for this partial LSN. - Closes: https://github.com/neondatabase/cloud/issues/27876 ## Summary of changes - Flush WAL via CHECKPOINT after requesting current_wal_lsn to make sure that the record we point to is flushed in full - Use proper endpoint in `test_timeline_detach_with_aux_files_with_detach_v1` --- test_runner/fixtures/neon_fixtures.py | 7 +++++++ test_runner/regress/test_timeline_detach_ancestor.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 85ad49bb4f..370eca5130 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -5477,6 +5477,13 @@ def wait_for_last_flush_lsn( if last_flush_lsn is None: last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + # The last_flush_lsn may not correspond to a record boundary. + # For example, if the compute flushed WAL on a page boundary, + # the remaining part of the record might not be flushed for a long time. + # This would prevent the pageserver from reaching last_flush_lsn promptly. + # To ensure the rest of the record reaches the pageserver quickly, + # we forcibly flush the WAL by using CHECKPOINT. + endpoint.safe_psql("CHECKPOINT") results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index a71652af8a..d42c5d403e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( endpoint2.safe_psql( "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" ) - lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( ["pg_replslot/test_slot_restore/state"] @@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert all_reparented == set([]) # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. 
- wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] ), "main branch unaffected" From 6c70789cfdf145ae4ca73228884ca1359b80c302 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 8 May 2025 12:14:41 +0200 Subject: [PATCH 062/142] storcon: increase drain+fill secondary warmup timeout from 20 to 30 seconds (#11848) ## Problem During deployment drains/fills, we often see the storage controller giving up on warmups after 20 seconds, when the warmup is nearly complete (~90%). This can cause latency spikes for migrated tenants if they block on layer downloads. Touches https://github.com/neondatabase/cloud/issues/26193. ## Summary of changes Increase the drain and fill secondary warmup timeout from 20 to 30 seconds. --- storage_controller/src/service.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 21c693af97..fdb791c2cf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8485,7 +8485,7 @@ impl Service { // By default, live migrations are generous about the wait time for getting // the secondary location up to speed. When draining, give up earlier in order // to not stall the operation when a cold secondary is encountered. - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) @@ -8818,7 +8818,7 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) From d22377c754556c95d24970458cb08968828902b3 Mon Sep 17 00:00:00 2001 From: Mark Novikov Date: Thu, 8 May 2025 15:04:28 +0400 Subject: [PATCH 063/142] Skip event triggers in dump-restore (#11794) ## Problem Data import fails if the src db has any event triggers, because those can only be restored by a superuser. Specifically imports from Heroku and Supabase are guaranteed to fail. 
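The fix, shown in the diff further below, is to have `fast_import` skip event triggers at dump time. A simplified sketch of the relevant `pg_dump` argument list follows; it is illustrative only (the helper name is made up for the example), the actual change is the single `--no-event-triggers` line in `compute_tools/src/bin/fast_import.rs`, and that flag is only available in the patched `pg_dump` builds listed in the summary of changes.

```rust
// Abridged sketch of the dump arguments assembled by fast_import (see the real list in the diff below).
fn dump_args() -> Vec<String> {
    vec![
        "--no-security-labels".to_string(),
        "--no-subscriptions".to_string(),
        "--no-tablespaces".to_string(),
        // New: event triggers can only be restored by a superuser, so skip them at dump time.
        "--no-event-triggers".to_string(),
        // output format
        "--format".to_string(),
        "directory".to_string(),
    ]
}
```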
Closes https://github.com/neondatabase/cloud/issues/27353 ## Summary of changes Depends on `pg_dump` patches per each supported PostgreSQL version: - https://github.com/neondatabase/postgres/pull/630 - https://github.com/neondatabase/postgres/pull/629 - https://github.com/neondatabase/postgres/pull/627 - https://github.com/neondatabase/postgres/pull/628 --- compute_tools/src/bin/fast_import.rs | 1 + test_runner/regress/test_import_pgdata.py | 49 +++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 +- 5 files changed, 54 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 537028cde1..78acd78585 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -348,6 +348,7 @@ async fn run_dump_restore( "--no-security-labels".to_string(), "--no-subscriptions".to_string(), "--no-tablespaces".to_string(), + "--no-event-triggers".to_string(), // format "--format".to_string(), "directory".to_string(), diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index a26c3994a5..2fda1991f7 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -641,6 +641,55 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_event_triggers( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql(""" + CREATE FUNCTION test_event_trigger_for_drops() + RETURNS event_trigger LANGUAGE plpgsql AS $$ + DECLARE + obj record; + BEGIN + FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE '% dropped object: % %.% %', + tg_tag, + obj.object_type, + obj.schema_name, + obj.object_name, + obj.object_identity; + END LOOP; + END + $$; + + CREATE EVENT TRIGGER test_event_trigger_for_drops + ON sql_drop + EXECUTE PROCEDURE test_event_trigger_for_drops(); + """) + + pg_port = port_distributor.get_port() + p = fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) + assert p.returncode == 0 + + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM pg_event_trigger;") + log.info(f"Result: {res}") + assert res[0][0] == 0, f"Neon does not support importing event triggers, got: {res[0][0]}" + + def test_fast_import_restore_to_connstring( test_output_dir, vanilla_pg: VanillaPostgres, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c8dab02bfc..108856a4ae 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194 +Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index eab3a37834..b763ab54b9 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66 +Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e diff --git a/vendor/revisions.json b/vendor/revisions.json 
index 74a6ff33d7..4307fd1c3f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.4", - "eab3a37834cac6ec0719bf817ac918a201712d66" + "b763ab54b98d232a0959371ab1d07f06ed77c49e" ], "v16": [ "16.8", @@ -13,6 +13,6 @@ ], "v14": [ "14.17", - "c8dab02bfc003ae7bd59096919042d7840f3c194" + "108856a4ae76be285b04497a0ed08fcbe60ddbe9" ] } From 42d93031a13b31cee2fbb8c2e7f1b094b0f554a2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 13:48:29 +0200 Subject: [PATCH 064/142] fixup(#11819): broken macOS build (#11861) refs - fixes https://github.com/neondatabase/neon/issues/11860 --- pageserver/src/virtual_file/io_engine.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index d8eb803335..7827682498 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -13,7 +13,7 @@ pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::IoBuf; -use tracing::{Instrument, info}; +use tracing::Instrument; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -111,7 +111,8 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; -use std::time::Duration; +#[cfg(target_os = "linux")] +use {std::time::Duration, tracing::info}; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; @@ -309,6 +310,7 @@ impl IoEngine { /// /// This function retries the operation once if it fails with ECANCELED. /// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. +#[cfg(target_os = "linux")] pub(super) async fn retry_ecanceled_once( resources: T, f: F, From 659366060dcef08a46c42c0794a829afb4270b1c Mon Sep 17 00:00:00 2001 From: Santosh Pingale <3813695+santosh-d3vpl3x@users.noreply.github.com> Date: Thu, 8 May 2025 16:09:15 +0200 Subject: [PATCH 065/142] Reuse remote_client from the SnapshotDownloader instead of recreating in download function (#11812) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem At the moment, remote_client and target are recreated in download function. We could reuse it from SnapshotDownloader instance. This isn't a problem per se, just a quality of life improvement but it caught my attention when we were trying out snapshot downloading in one of the older version and ran into a curious case of s3 clients behaving in two different manners. One client that used `force_path_style` and other one didn't. **Logs from this run:** ``` 2025-05-02T12:56:22.384626Z DEBUG /data/snappie/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001 requires download... 
2025-05-02T12:56:22.384689Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:apply_configuration: timeout settings for this operation: TimeoutConfig { connect_timeout: Set(3.1s), read_timeout: Disabled, operation_timeout: Disabled, operation_attempt_timeout: Disabled } 2025-05-02T12:56:22.384730Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'serialization' phase 2025-05-02T12:56:22.384784Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'before transmit' phase 2025-05-02T12:56:22.384813Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: retry strategy has OKed initial request 2025-05-02T12:56:22.384841Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: beginning attempt #1 2025-05-02T12:56:22.384870Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: resolving endpoint endpoint_params=EndpointResolverParams(TypeErasedBox[!Clone]:Params { bucket: Some("bucket"), region: Some("eu-north-1"), use_fips: false, use_dual_stack: false, endpoint: Some("https://s3.self-hosted.company.com"), force_path_style: false, accelerate: false, use_global_endpoint: false, use_object_lambda_endpoint: None, key: None, prefix: Some("/pageserver/tenants/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001"), copy_source: None, disable_access_points: None, disable_multi_region_access_points: false, use_arn_region: None, use_s3_express_control_endpoint: None, disable_s3_express_session_auth: None }) endpoint_prefix=None 2025-05-02T12:56:22.384979Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: will use endpoint Endpoint { url: "https://neon.s3.self-hosted.company.com", headers: {}, properties: {"authSchemes": Array([Object({"signingRegion": String("eu-north-1"), "disableDoubleEncoding": Bool(true), "name": String("sigv4"), "signingName": String("s3")})])} } 2025-05-02T12:56:22.385042Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity:provide_credentials{provider=default_chain}: loaded credentials provider=Environment 2025-05-02T12:56:22.385066Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity: identity cache miss occurred; added new identity (took 35.958µs) new_expiration=2025-05-02T13:11:22.385028Z valid_for=899.999961437s partition=IdentityCachePartition(5) 2025-05-02T12:56:22.385090Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: loaded identity 2025-05-02T12:56:22.385162Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: entering 'transmit' phase 2025-05-02T12:56:22.385211Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: new TCP connector created in 361ns 2025-05-02T12:56:22.385288Z DEBUG resolving host="neon.s3.self-hosted.company.com" 2025-05-02T12:56:22.390796Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: encountered orchestrator error; halting ``` --- storage_scrubber/src/tenant_snapshot.rs | 15 ++++++--------- 1 file 
changed, 6 insertions(+), 9 deletions(-) diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 24231e32fc..d0ca53f8ab 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -24,7 +24,6 @@ pub struct SnapshotDownloader { remote_client: GenericRemoteStorage, #[allow(dead_code)] target: RootTarget, - bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, @@ -43,7 +42,6 @@ impl SnapshotDownloader { Ok(Self { remote_client, target, - bucket_config, tenant_id, output_path, concurrency, @@ -218,11 +216,9 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (remote_client, target) = - init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; - // Generate a stream of TenantShardId - let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; + let shards = + stream_tenant_shards(&self.remote_client, &self.target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards that have the highest count: avoids redundantly downloading @@ -240,7 +236,8 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; + let timelines = + stream_tenant_timelines(&self.remote_client, &self.target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( @@ -251,8 +248,8 @@ impl SnapshotDownloader { let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = - timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid)); + let timelines = timelines + .map_ok(|ttid| load_timeline_index(&self.remote_client, &self.target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { From 622b3b29936d0496808396e447e678177a58412d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 8 May 2025 17:13:11 +0200 Subject: [PATCH 066/142] Fixes for enabling --timelines-onto-safekeepers in tests (#11854) Second PR with fixes extracted from #11712, relating to `--timelines-onto-safekeepers`. 
Does the following: * Moves safekeeper registration to `neon_local` instead of the test fixtures * Pass safekeeper JWT token if `--timelines-onto-safekeepers` is enabled * Allow some warnings related to offline safekeepers (similarly to how we allow them for offline pageservers) * Enable generations on the compute's config if `--timelines-onto-safekeepers` is enabled * fix parallel `pull_timeline` race condition (the one that #11786 put for later) Fixes #11424 Part of #11670 --- control_plane/src/bin/neon_local.rs | 9 +- control_plane/src/storage_controller.rs | 100 ++++++++++++++++-- safekeeper/src/http/routes.rs | 3 +- safekeeper/src/pull_timeline.rs | 30 ++++-- test_runner/fixtures/neon_fixtures.py | 24 ----- .../fixtures/pageserver/allowed_errors.py | 4 + 6 files changed, 131 insertions(+), 39 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 610fa5f865..191a22f1de 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1417,7 +1417,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let pageserver_id = args.endpoint_pageserver_id; let remote_ext_base_url = &args.remote_ext_base_url; - let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); + let default_generation = env + .storage_controller + .timelines_onto_safekeepers + .then_some(1); + let safekeepers_generation = args + .safekeepers_generation + .or(default_generation) + .map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index a36815d27e..755d67a7ad 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, + SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; use pageserver_api::models::{ @@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; -use reqwest::Method; +use reqwest::{Method, Response}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -570,6 +571,11 @@ impl StorageController { let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) .expect("failed to generate jwt token"); args.push(format!("--peer-jwt-token={peer_jwt_token}")); + + let claims = Claims::new(None, Scope::SafekeeperData); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--safekeeper-jwt-token={jwt_token}")); } if let Some(public_key) = &self.public_key { @@ -614,6 +620,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() { + anyhow::bail!("Safekeeper set up for auth but no private key specified"); + } + if self.config.timelines_onto_safekeepers { 
args.push("--timelines-onto-safekeepers".to_string()); } @@ -640,6 +650,10 @@ impl StorageController { ) .await?; + if self.config.timelines_onto_safekeepers { + self.register_safekeepers().await?; + } + Ok(()) } @@ -743,6 +757,23 @@ impl StorageController { where RQ: Serialize + Sized, RS: DeserializeOwned + Sized, + { + let response = self.dispatch_inner(method, path, body).await?; + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch_inner( + &self, + method: reqwest::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order @@ -785,10 +816,31 @@ impl StorageController { let response = builder.send().await?; let response = response.error_from_body().await?; - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + Ok(response) + } + + /// Register the safekeepers in the storage controller + #[instrument(skip(self))] + async fn register_safekeepers(&self) -> anyhow::Result<()> { + for sk in self.env.safekeepers.iter() { + let sk_id = sk.id; + let body = serde_json::json!({ + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.pg_port, + "http_port": sk.http_port, + "https_port": sk.https_port, + "version": 5957, + "availability_zone_id": format!("us-east-2b-{sk_id}"), + }); + self.upsert_safekeeper(sk_id, body).await?; + self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active) + .await?; + } + Ok(()) } /// Call into the attach_hook API, for use before handing out attachments to pageservers @@ -816,6 +868,42 @@ impl StorageController { Ok(response.generation) } + #[instrument(skip(self))] + pub async fn upsert_safekeeper( + &self, + node_id: NodeId, + request: serde_json::Value, + ) -> anyhow::Result<()> { + let resp = self + .dispatch_inner::( + Method::POST, + format!("control/v1/safekeeper/{node_id}"), + Some(request), + ) + .await?; + if !resp.status().is_success() { + anyhow::bail!( + "setting scheduling policy unsuccessful for safekeeper {node_id}: {}", + resp.status() + ); + } + Ok(()) + } + + #[instrument(skip(self))] + pub async fn safekeeper_scheduling_policy( + &self, + node_id: NodeId, + scheduling_policy: SkSchedulingPolicy, + ) -> anyhow::Result<()> { + self.dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await + } + #[instrument(skip(self))] pub async fn inspect( &self, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 2b2d721db2..1a25b07496 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -243,8 +243,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result, ssl_ca_certs: Vec, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -411,7 +412,9 @@ pub async fn handle_request( for ssl_ca_cert in ssl_ca_certs { http_client = http_client.add_root_certificate(ssl_ca_cert); } - let http_client = http_client.build()?; + let http_client = http_client + .build() + .map_err(|e| 
ApiError::InternalServerError(e.into()))?; let http_hosts = request.http_hosts.clone(); @@ -443,10 +446,10 @@ pub async fn handle_request( // offline and C comes online. Then we want a pull on C with A and B as hosts to work. let min_required_successful = (http_hosts.len() - 1).max(1); if statuses.len() < min_required_successful { - bail!( + return Err(ApiError::InternalServerError(anyhow::anyhow!( "only got {} successful status responses. required: {min_required_successful}", statuses.len() - ) + ))); } // Find the most advanced safekeeper @@ -465,7 +468,7 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline( + match pull_timeline( status, safekeeper_host, sk_auth_token, @@ -473,6 +476,21 @@ pub async fn handle_request( global_timelines, ) .await + { + Ok(resp) => Ok(resp), + Err(e) => { + match e.downcast_ref::() { + Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse { + safekeeper_host: None, + }), + Some(TimelineError::CreationInProgress(_)) => { + // We don't return success here because creation might still fail. + Err(ApiError::Conflict("Creation in progress".to_owned())) + } + _ => Err(ApiError::InternalServerError(e)), + } + } + } } async fn pull_timeline( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 370eca5130..547c640a40 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1409,30 +1409,6 @@ class NeonEnv: for f in futs: f.result() - # Last step: register safekeepers at the storage controller - if ( - self.storage_controller_config is not None - and self.storage_controller_config.get("timelines_onto_safekeepers") is True - ): - for sk_id, sk in enumerate(self.safekeepers): - # 0 is an invalid safekeeper id - sk_id = sk_id + 1 - body = { - "id": sk_id, - "created_at": "2023-10-25T09:11:25Z", - "updated_at": "2024-08-28T11:32:43Z", - "region_id": "aws-us-east-2", - "host": "127.0.0.1", - "port": sk.port.pg, - "http_port": sk.port.http, - "https_port": None, - "version": 5957, - "availability_zone_id": f"us-east-2b-{sk_id}", - } - - self.storage_controller.on_safekeeper_deploy(sk_id, body) - self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") - self.endpoint_storage.start(timeout_in_seconds=timeout_in_seconds) def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 24c856e279..43bffd919c 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -122,6 +122,10 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", + # Many tests will take safekeepers offline + ".*Call to safekeeper.*management API.*failed.*receive body.*", + ".*Call to safekeeper.*management API.*failed.*ReceiveBody.*", + ".*Call to safekeeper.*management API.*failed.*Timeout.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode From 8477d15f95ffb094c444e658bbcdb95301b1a750 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 18:11:45 +0200 Subject: [PATCH 067/142] feat(direct 
IO): remove special case in test suite for compat tests (#11864) PR - https://github.com/neondatabase/neon/pull/11558 adds special treatment for compat snapshot binaries which don't understand the `direct-rw` mode. A new compat snapshot has been published since, so, we can remove the special case. refs: - fixes https://github.com/neondatabase/neon/issues/11598 --- test_runner/fixtures/neon_fixtures.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 547c640a40..aa468d9386 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1299,13 +1299,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_virtual_file_io_mode is not None: - # TODO(christian): https://github.com/neondatabase/neon/issues/11598 - if not config.test_may_use_compatibility_snapshot_binaries: - ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode - else: - log.info("ignoring virtual_file_io_mode parametrization for compatibility test") - if self.pageserver_wal_receiver_protocol is not None: key, value = PageserverWalReceiverProtocol.to_config_key_value( self.pageserver_wal_receiver_protocol From bef5954fd7b8ea43cac6f43a111d437cd7a360ad Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 8 May 2025 17:46:57 +0100 Subject: [PATCH 068/142] feat(proxy): track SNI usage by protocol, including for http (#11863) ## Problem We want to see how many users of the legacy serverless driver are still using the old URL for SQL-over-HTTP traffic. ## Summary of changes Adds a protocol field to the connections_by_sni metric. Ensures it's incremented for sql-over-http. --- proxy/src/auth/credentials.rs | 29 ++++++++++++++------------- proxy/src/metrics.rs | 15 +++++++++++--- proxy/src/serverless/mod.rs | 1 + proxy/src/serverless/sql_over_http.rs | 28 +++++++++++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 8 ++++---- 5 files changed, 59 insertions(+), 22 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 183976374a..526d0df7f2 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -12,9 +12,9 @@ use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; -use crate::metrics::{Metrics, SniKind}; +use crate::metrics::{Metrics, SniGroup, SniKind}; use crate::proxy::NeonOptions; -use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI}; use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] @@ -65,7 +65,7 @@ pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option< if !common_names.contains(common_name) { return None; } - if subdomain == SERVERLESS_DRIVER_SNI { + if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI { return None; } Some(EndpointId::from(subdomain)) @@ -128,22 +128,23 @@ impl ComputeUserInfoMaybeEndpoint { let metrics = Metrics::get(); debug!(%user, "credentials"); - if sni.is_some() { + + let protocol = ctx.protocol(); + let kind = if sni.is_some() { debug!("Connection with sni"); - metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); + SniKind::Sni } else if endpoint.is_some() { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::NoSni); debug!("Connection without sni"); + SniKind::NoSni } else { - metrics - 
.proxy - .accepted_connections_by_sni - .inc(SniKind::PasswordHack); debug!("Connection with password hack"); - } + SniKind::PasswordHack + }; + + metrics + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); let options = NeonOptions::parse_params(params); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e5fc0b724b..4b22c912eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -115,8 +115,8 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, - /// Number of connections (per sni). - pub accepted_connections_by_sni: CounterVec>, + /// Number of connections, by the method we used to determine the endpoint. + pub accepted_connections_by_sni: CounterVec, /// Number of connection failures (per kind). pub connection_failures_total: CounterVec>, @@ -342,11 +342,20 @@ pub enum LatencyExclusions { ClientCplaneComputeRetry, } +#[derive(LabelGroup)] +#[label(set = SniSet)] +pub struct SniGroup { + pub protocol: Protocol, + pub kind: SniKind, +} + #[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] pub enum SniKind { + /// Domain name based routing. SNI for libpq/websockets. Host for HTTP Sni, + /// Metadata based routing. `options` for libpq/websockets. Header for HTTP NoSni, + /// Metadata based routing, using the password field. PasswordHack, } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6f24ad3dec..2a7069b1c2 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -56,6 +56,7 @@ use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const AUTH_BROKER_SNI: &str = "apiauth"; pub async fn task_main( config: &'static ProxyConfig, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index fee5942b7e..dfaeedaeae 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,7 +38,7 @@ use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; -use crate::metrics::{HttpDirection, Metrics}; +use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; @@ -227,6 +227,32 @@ fn get_conn_info( } } + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + ctx.set_user_agent( headers .get(hyper::header::USER_AGENT) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aa468d9386..1b4562c0b3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3835,7 
+3835,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 + self.domain = "local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3852,7 +3852,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) + generate_proxy_tls_certs(f"apiauth.{self.domain}", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3896,10 +3896,10 @@ class NeonAuthBroker: log.info(f"Executing http query: {query}") - connstr = f"postgresql://{user}@{self.domain}/postgres" + connstr = f"postgresql://{user}@ep-foo-bar-1234.{self.domain}/postgres" async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: response = await client.post( - f"https://{self.domain}:{self.external_http_port}/sql", + f"https://apiauth.{self.domain}:{self.external_http_port}/sql", json={"query": query, "params": args}, headers={ "Neon-Connection-String": connstr, From b37bb7d7edaab870d05bff7286e345066d49664e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 20:48:24 +0200 Subject: [PATCH 069/142] pageserver: timeline shutdown: fully quiesce ingest path before`freeze_and_flush` (#11851) # Problem Before this PR, timeline shutdown would - cancel the walreceiver cancellation token subtree (child token of Timeline::cancel) - call freeze_and_flush - Timeline::cancel.cancel() - ... bunch of waiting for things ... - Timeline::gate.close() As noted by the comment that is deleted by this PR, this left a window where, after freeze_and_flush, walreceiver could still be running and ingest data into a new InMemoryLayer. This presents a potential source of log noise during Timeline shutdown where the InMemoryLayer created after the freeze_and_flush observes that Timeline::cancel is cancelled, failing the ingest with some anyhow::Error wrapping (deeply) a `FlushTaskError::Cancelled` instance (`flush task cancelled` error message). # Solution It turns out that it is quite easy to shut down, not just cancel, walreceiver completely because the only subtask spawned by walreceiver connection manager is the `handle_walreceiver_connection` task, which is properly shut down and waited upon when the manager task observes cancellation and exits its retry loop. The alternative is to replace all the usage of `anyhow` on the ingest path with differentiated error types. A lot of busywork for little gain to fix a potential logging noise nuisance, so, not doing that for now. # Correctness / Risk We do not risk leaking walreceiver child tasks because existing discipline is to hold a gate guard. We will prolong `Timeline::shutdown` to the degree that we're no longer making progress with the rest of shutdown while the walreceiver task hasn't yet observed cancellation. In practice, this should be negligible. `Timeline::shutdown` could fail to complete if there is a hidden dependency of walreceiver shutdown on some subsystem. The code certainly suggests there isn't, and I'm not aware of any such dependency. Anyway, impact will be low because we only shut down Timeline instances that are obsolete, either because there is a newer attachment at a different location, or because the timeline got deleted by the user. 
We would learn about this through stuck cplane operations or stuck storcon reconciliations. We would be able to mitigate by cancelling such stuck operations/reconciliations and/or by rolling back pageserver. # Refs - identified this while investigating https://github.com/neondatabase/neon/issues/11762 - PR that _does_ fix a bunch _real_ `flush task cancelled` noise on the compaction path: https://github.com/neondatabase/neon/pull/11853 --- pageserver/src/tenant/timeline.rs | 12 ++---------- pageserver/src/tenant/timeline/walreceiver.rs | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c8d897d074..d7f5958128 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2127,22 +2127,14 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); // Regardless of whether we're going to try_freeze_and_flush - // or not, stop ingesting any more data. Walreceiver only provides - // cancellation but no "wait until gone", because it uses the Timeline::gate. - // So, only after the self.gate.close() below will we know for sure that - // no walreceiver tasks are left. - // For `try_freeze_and_flush=true`, this means that we might still be ingesting - // data during the call to `self.freeze_and_flush()` below. - // That's not ideal, but, we don't have the concept of a ChildGuard, - // which is what we'd need to properly model early shutdown of the walreceiver - // task sub-tree before the other Timeline task sub-trees. + // or not, stop ingesting any more data. let walreceiver = self.walreceiver.lock().unwrap().take(); tracing::debug!( is_some = walreceiver.is_some(), "Waiting for WalReceiverManager..." ); if let Some(walreceiver) = walreceiver { - walreceiver.cancel(); + walreceiver.shutdown().await; } // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4f80073cc3..0f73eb839b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -63,6 +63,7 @@ pub struct WalReceiver { /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. 
cancel: CancellationToken, + task: tokio::task::JoinHandle<()>, } impl WalReceiver { @@ -79,7 +80,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); - WALRECEIVER_RUNTIME.spawn({ + let task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -120,14 +121,25 @@ impl WalReceiver { Self { manager_status, cancel, + task, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] - pub fn cancel(&self) { + pub async fn shutdown(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); + match self.task.await { + Ok(()) => debug!("Shutdown success"), + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged by panic hook + } + Err(je) => { + error!("shutdown walreceiver task join error: {je}") + } + } } pub(crate) fn status(&self) -> Option { From 101e115b3885dd966a839ef50b450771988fa9aa Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 9 May 2025 09:54:40 +0300 Subject: [PATCH 070/142] Change prefetch logic in vacuum (#11650) ## Problem See https://neondb.slack.com/archives/C03QLRH7PPD/p1745003314183649 Vacuum doesn't use prefetch because this strange logic in `lazy_scan_heap`: ``` /* And only up to the next unskippable block */ if (next_prefetch_block + prefetch_budget > vacrel->next_unskippable_block) prefetch_budget = vacrel->next_unskippable_block - next_prefetch_block; ``` ## Summary of changes Disable prefetch only if vacuum jumps to next skippable block (there is SKIP_PAGES_THRESHOLD) which cancel seqscan and perform jump only if gap is large enough). 
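In other words, the prefetch budget should only be capped when vacuum will actually jump over the skippable range, i.e. when the gap to the next unskippable block reaches SKIP_PAGES_THRESHOLD; for shorter gaps the heap is still scanned sequentially, so prefetching past `next_unskippable_block` remains useful. The change itself lives in the vendored Postgres trees (this patch only bumps the submodule hashes), so the following is only a rough C sketch of the reworked condition under that reading, not the literal diff from the Postgres PRs listed below:

```
/*
 * Rough sketch only, reusing the variable names from the snippet above; the
 * real change is in the neondatabase/postgres PRs referenced below.
 * Cap the prefetch budget at next_unskippable_block only when the
 * all-visible run is long enough (SKIP_PAGES_THRESHOLD) for vacuum to
 * actually skip it; otherwise keep prefetching, since those blocks will be
 * read sequentially anyway.
 */
if (next_prefetch_block + prefetch_budget > vacrel->next_unskippable_block &&
    vacrel->next_unskippable_block - next_prefetch_block >= SKIP_PAGES_THRESHOLD)
    prefetch_budget = vacrel->next_unskippable_block - next_prefetch_block;
```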
Postgres PRs: https://github.com/neondatabase/postgres/pull/620 https://github.com/neondatabase/postgres/pull/621 https://github.com/neondatabase/postgres/pull/622 https://github.com/neondatabase/postgres/pull/623 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 108856a4ae..06b405bc98 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9 +Subproject commit 06b405bc982fd53522689aa4acbfd9c44b7993cf diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index b838c8969b..72f83df76c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c +Subproject commit 72f83df76c61ce18d81bd371f0afd2a43d59c052 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index b763ab54b9..0d59c91c1a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e +Subproject commit 0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44 diff --git a/vendor/revisions.json b/vendor/revisions.json index 4307fd1c3f..10aad7e1a2 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.4", - "b763ab54b98d232a0959371ab1d07f06ed77c49e" + "0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44" ], "v16": [ "16.8", @@ -9,10 +9,10 @@ ], "v15": [ "15.12", - "b838c8969b7c63f3e637a769656f5f36793b797c" + "72f83df76c61ce18d81bd371f0afd2a43d59c052" ], "v14": [ "14.17", - "108856a4ae76be285b04497a0ed08fcbe60ddbe9" + "06b405bc982fd53522689aa4acbfd9c44b7993cf" ] } From 5cd7f936f90978673a1f6a1dc64765e701035aa4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 9 May 2025 08:48:30 +0100 Subject: [PATCH 071/142] fix(neon-rls): optimistically assume role grants are already assigned for replicas (#11811) ## Problem Read replicas cannot grant permissions for roles for Neon RLS. Usually the permission is already granted, so we can optimistically check. See INC-509 ## Summary of changes Perform a permission lookup prior to actually executing any grants. 
--- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 52 +++++++++++++++++-------- test_runner/fixtures/neon_fixtures.py | 10 ++++- test_runner/regress/test_role_grants.py | 7 ++++ 5 files changed, 52 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fe4cc35029..7083baa092 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1303,6 +1303,7 @@ dependencies = [ "futures", "http 1.1.0", "indexmap 2.0.1", + "itertools 0.10.5", "jsonwebtoken", "metrics", "nix 0.27.1", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8ee5dd0665..f9da3ba700 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -28,6 +28,7 @@ flate2.workspace = true futures.workspace = true http.workspace = true indexmap.workspace = true +itertools.workspace = true jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 25920675c1..f494e2444a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -11,6 +11,7 @@ use compute_api::spec::{ use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; +use itertools::Itertools; use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use once_cell::sync::Lazy; @@ -18,7 +19,7 @@ use postgres; use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; @@ -1995,23 +1996,40 @@ LIMIT 100", tokio::spawn(conn); // TODO: support other types of grants apart from schemas? - let query = format!( - "GRANT {} ON SCHEMA {} TO {}", - privileges - .iter() - // should not be quoted as it's part of the command. - // is already sanitized so it's ok - .map(|p| p.as_str()) - .collect::>() - .join(", "), - // quote the schema and role name as identifiers to sanitize them. - schema_name.pg_quote(), - role_name.pg_quote(), - ); - db_client - .simple_query(&query) + + // check the role grants first - to gracefully handle read-replicas. + let select = "SELECT privilege_type + FROM pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true + JOIN pg_user users ON acl.grantee = users.usesysid + WHERE users.usename = $1 + AND nspname = $2"; + let rows = db_client + .query(select, &[role_name, schema_name]) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {select}"))?; + + let already_granted: HashSet = rows.into_iter().map(|row| row.get(0)).collect(); + + let grants = privileges + .iter() + .filter(|p| !already_granted.contains(p.as_str())) + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .join(", "); + + if !grants.is_empty() { + // quote the schema and role name as identifiers to sanitize them. 
+ let schema_name = schema_name.pg_quote(); + let role_name = role_name.pg_quote(); + + let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } Ok(()) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1b4562c0b3..131820f23e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4613,7 +4613,10 @@ class EndpointFactory: return self def new_replica( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4629,7 +4632,10 @@ class EndpointFactory: ) def new_replica_start( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py index b2251875f0..5b13d461f0 100644 --- a/test_runner/regress/test_role_grants.py +++ b/test_runner/regress/test_role_grants.py @@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv): res = cur.fetchall() assert res == [(1,)], "select should not succeed" + + # confirm that replicas can also ensure the grants are correctly set. + replica = env.endpoints.new_replica_start(endpoint) + replica_client = replica.http_client() + replica_client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) From 03d635b916ed057826d80bbc709864acb1c108f1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 9 May 2025 12:07:08 +0300 Subject: [PATCH 072/142] Add more guards for prefetch_pump_state (#11859) ## Problem See https://neondb.slack.com/archives/C08PJ07BZ44/p1746566292750689 Looks like there are more cases when `prefetch_pump_state` can be called in unexpected place and cause core dump. ## Summary of changes Add more guards. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 36 +++++++++++++++++++++--------------- pgxn/neon/communicator.h | 2 +- pgxn/neon/pagestore_smgr.c | 20 ++++++++++---------- vendor/postgres-v16 | 2 +- vendor/revisions.json | 2 +- 5 files changed, 34 insertions(+), 28 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 818a149499..9609f186b9 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -425,15 +425,12 @@ compact_prefetch_buffers(void) * point inside and outside PostgreSQL. * * This still does throw errors when it receives malformed responses from PS. - * - * When we're not called from CHECK_FOR_INTERRUPTS (indicated by - * IsHandlingInterrupts) we also report we've ended prefetch receive work, - * just in case state tracking was lost due to an error in the sync getPage - * response code. 
*/ void -communicator_prefetch_pump_state(bool IsHandlingInterrupts) +communicator_prefetch_pump_state(void) { + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive != MyPState->ring_flush) { NeonResponse *response; @@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts) } } - /* We never pump the prefetch state while handling other pages */ - if (!IsHandlingInterrupts) - END_PREFETCH_RECEIVE_WORK(); + END_PREFETCH_RECEIVE_WORK(); communicator_reconfigure_timeout_if_needed(); } @@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index) Assert(MyPState->ring_unused > ring_index); + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive <= ring_index) { - START_PREFETCH_RECEIVE_WORK(); entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); @@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index) result = false; break; } - - END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } + if (result) { /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ PrefetchRequest *slot = GetPrfSlot(ring_index); - return slot->status == PRFS_RECEIVED; + result = slot->status == PRFS_RECEIVED; } - return false; + END_PREFETCH_RECEIVE_WORK(); + + return result; ; } @@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + Assert(readpage_reentrant_guard); if (slot->status != PRFS_REQUESTED || slot->response != NULL || @@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag) PrfHashEntry *entry; PrefetchRequest hashkey; + Assert(readpage_reentrant_guard); hashkey.buftag = tag; entry = prfh_lookup(MyPState->prf_hash, &hashkey); if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) @@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag) void prefetch_on_ps_disconnect(void) { + bool save_readpage_reentrant_guard = readpage_reentrant_guard; MyPState->ring_flush = MyPState->ring_unused; + /* Prohibit callig of prefetch_pump_state */ + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void) MyNeonCounters->getpage_prefetch_discards_total += 1; } + /* Restore guard */ + readpage_reentrant_guard = save_readpage_reentrant_guard; + /* * We can have gone into retry due to network error, so update stats with * the latest available @@ -2509,7 +2515,7 @@ communicator_processinterrupts(void) if (timeout_signaled) { if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) - communicator_prefetch_pump_state(true); + communicator_prefetch_pump_state(); timeout_signaled = false; communicator_reconfigure_timeout_if_needed(); diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h index f55c4b10f1..5376c9b839 100644 --- a/pgxn/neon/communicator.h +++ b/pgxn/neon/communicator.h @@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno, void *buffer); extern void communicator_reconfigure_timeout_if_needed(void); -extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts); +extern void communicator_prefetch_pump_state(void); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 87eb420717..f574517b2a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1179,7 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber 
forknum, BlockNumber blocknum, blocknum += iterblocks; } - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1218,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1262,7 +1262,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1315,7 +1315,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); @@ -1339,7 +1339,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1449,7 +1449,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); @@ -1480,7 +1480,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
*/ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1665,7 +1665,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1727,7 +1727,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 05ddf212e2..d72d76f2cd 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae +Subproject commit d72d76f2cdee4194dd052ce099e9784aca7c794a diff --git a/vendor/revisions.json b/vendor/revisions.json index 10aad7e1a2..e76510f969 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,7 +5,7 @@ ], "v16": [ "16.8", - "05ddf212e2e07b788b5c8b88bdcf98630941f6ae" + "d72d76f2cdee4194dd052ce099e9784aca7c794a" ], "v15": [ "15.12", From d0dc65da124d3f84e2f64ac5e3927b0a299c9eab Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 9 May 2025 18:12:49 +0800 Subject: [PATCH 073/142] fix(pageserver): give up gc-compaction if one key has too long history (#11869) ## Problem The limitation we imposed last week https://github.com/neondatabase/neon/pull/11709 is not enough to protect excessive memory usage. ## Summary of changes If a single key accumulated too much history, give up compaction. In the future, we can make the `generate_key_retention` function take a stream of keys instead of first accumulating them in memory, thus easily support such long key history cases. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 07cd274a41..6b155268d6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3606,6 +3606,13 @@ impl Timeline { last_key = Some(key); } accumulated_values.push((key, lsn, val)); + + if accumulated_values.len() >= 65536 { + // Assume all of them are images, that would be 512MB of data in memory for a single key. + return Err(CompactionError::Other(anyhow!( + "too many values for a single key, giving up gc-compaction" + ))); + } } else { let last_key: &mut Key = last_key.as_mut().unwrap(); stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction From d0aaec2abbf502a962351b5939f1fae974053cd5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 9 May 2025 11:55:26 +0100 Subject: [PATCH 074/142] storage_controller: create imported timelines on safekeepers (#11801) ## Problem SK timeline creations were skipped for imported timelines since we didn't know the correct start LSN of the timeline at that point. 
## Summary of changes Created imported timelines on the SK as part of the import finalize step. We use the last record LSN of shard 0 as the start LSN for the safekeeper timeline. Closes https://github.com/neondatabase/neon/issues/11569 --- storage_controller/src/service.rs | 51 ++++++++++++++----- .../src/service/safekeeper_service.rs | 36 +++++++++++++ test_runner/regress/test_import_pgdata.py | 33 ++++++------ 3 files changed, 90 insertions(+), 30 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fdb791c2cf..193050460d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3886,10 +3886,10 @@ impl Service { None } else if safekeepers { - // Note that we do not support creating the timeline on the safekeepers - // for imported timelines. The `start_lsn` of the timeline is not known - // until the import finshes. - // https://github.com/neondatabase/neon/issues/11569 + // Note that for imported timelines, we do not create the timeline on the safekeepers + // straight away. Instead, we do it once the import finalized such that we know what + // start LSN to provide for the safekeepers. This is done in + // [`Self::finalize_timeline_import`]. let res = self .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id)) @@ -3966,11 +3966,22 @@ impl Service { let active = self.timeline_active_on_all_shards(&import).await?; match active { - true => { + Some(timeline_info) => { tracing::info!("Timeline became active on all shards"); + + if self.config.timelines_onto_safekeepers { + // Now that we know the start LSN of this timeline, create it on the + // safekeepers. + self.tenant_timeline_create_safekeepers_until_success( + import.tenant_id, + timeline_info, + ) + .await?; + } + break; } - false => { + None => { tracing::info!("Timeline not active on all shards yet"); tokio::select! { @@ -4004,9 +4015,6 @@ impl Service { .range_mut(TenantShardId::tenant_range(import.tenant_id)) .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle); - // TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn, - // so we can't create the timeline on the safekeepers. Fix by moving creation here. - // https://github.com/neondatabase/neon/issues/11569 tracing::info!(%import_failed, "Timeline import complete"); Ok(()) @@ -4021,10 +4029,16 @@ impl Service { .await; } + /// If the timeline is active on all shards, returns the [`TimelineInfo`] + /// collected from shard 0. + /// + /// An error is returned if the shard layout has changed during the import. + /// This is guarded against within the storage controller and the pageserver, + /// and, therefore, unexpected. 
async fn timeline_active_on_all_shards( self: &Arc, import: &TimelineImport, - ) -> anyhow::Result { + ) -> anyhow::Result> { let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -4048,13 +4062,17 @@ impl Service { .expect("Pageservers may not be deleted while referenced"); targets.push((*tenant_shard_id, node.clone())); } else { - return Ok(false); + return Ok(None); } } targets }; + if targets.is_empty() { + anyhow::bail!("No shards found to finalize import for"); + } + let results = self .tenant_for_shards_api( targets, @@ -4070,10 +4088,17 @@ impl Service { ) .await; - Ok(results.into_iter().all(|res| match res { + let all_active = results.iter().all(|res| match res { Ok(info) => info.state == TimelineState::Active, Err(_) => false, - })) + }); + + if all_active { + // Both unwraps are validated above + Ok(Some(results.into_iter().next().unwrap().unwrap())) + } else { + Ok(None) + } } pub(crate) async fn tenant_timeline_archival_config( diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 5eecf0d415..5c15660ba3 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -323,6 +323,42 @@ impl Service { }) } + pub(crate) async fn tenant_timeline_create_safekeepers_until_success( + self: &Arc, + tenant_id: TenantId, + timeline_info: TimelineInfo, + ) -> anyhow::Result<()> { + const BACKOFF: Duration = Duration::from_secs(5); + + loop { + if self.cancel.is_cancelled() { + anyhow::bail!("Shut down requested while finalizing import"); + } + + let res = self + .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) + .await; + + match res { + Ok(_) => { + tracing::info!("Timeline created on safekeepers"); + break; + } + Err(err) => { + tracing::error!("Failed to create timeline on safekeepers: {err}"); + tokio::select! { + _ = self.cancel.cancelled() => { + anyhow::bail!("Shut down requested while finalizing import"); + }, + _ = tokio::time::sleep(BACKOFF) => {} + }; + } + } + } + + Ok(()) + } + /// Directly insert the timeline into the database without reconciling it with safekeepers. /// /// Useful if the timeline already exists on the specified safekeepers, diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 2fda1991f7..05e63ad955 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -24,6 +24,7 @@ from fixtures.utils import ( skip_in_debug_build, wait_until, ) +from fixtures.workload import Workload from mypy_boto3_kms import KMSClient from mypy_boto3_kms.type_defs import EncryptResponseTypeDef from mypy_boto3_s3 import S3Client @@ -97,6 +98,10 @@ def test_pgdata_import_smoke( f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" ) + if neon_env_builder.storage_controller_config is None: + neon_env_builder.storage_controller_config = {} + neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True + env = neon_env_builder.init_start() # The test needs LocalFs support, which is only built in testing mode. 
@@ -286,34 +291,28 @@ def test_pgdata_import_smoke( # # validate that we can write # - rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, - endpoint_id="rw", - tenant_id=tenant_id, - config_lines=ep_config, - ) - rw_endpoint.safe_psql("create table othertable(values text)") - rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name) + workload.init() + workload.write_rows(64) + workload.validate() - # TODO: consider using `class Workload` here - # to do compaction and whatnot? + rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()")) # # validate that we can branch (important use case) # # ... at the tip - _ = env.create_branch( + child_timeline_id = env.create_branch( new_branch_name="br-tip", ancestor_branch_name=import_branch_name, tenant_id=tenant_id, ancestor_start_lsn=rw_lsn, ) - br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config - ) - validate_vanilla_equivalence(br_tip_endpoint) - br_tip_endpoint.safe_psql("select * from othertable") + child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip") + child_workload.validate() + + validate_vanilla_equivalence(child_workload.endpoint()) # ... at the initdb lsn _ = env.create_branch( @@ -330,7 +329,7 @@ def test_pgdata_import_smoke( ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): - br_initdb_endpoint.safe_psql("select * from othertable") + br_initdb_endpoint.safe_psql(f"select * from {workload.table}") @run_only_on_default_postgres(reason="PG version is irrelevant here") From 93b964f829f05b4c7e9bf6408f504bf6b70e033b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 9 May 2025 20:07:52 +0800 Subject: [PATCH 075/142] fix(pageserver): do not do image compaction if it's below gc cutoff (#11872) ## Problem We observe image compaction errors after gc-compaction finishes compacting below the gc_cutoff. This is because `repartition` returns an LSN below the gc horizon as we (likely) determined that `distance <= self.repartition_threshold`. I think it's better to keep the current behavior of when to trigger compaction but we should skip image compaction if the returned LSN is below the gc horizon. ## Summary of changes If the repartition returns an invalid LSN, skip image compaction. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6b155268d6..e7d39db70d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1277,6 +1277,8 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } + let gc_cutoff = *self.applied_gc_cutoff_lsn.read(); + // 2. 
Repartition and create image layers if necessary match self .repartition( @@ -1287,7 +1289,7 @@ impl Timeline { ) .await { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::from(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -1341,6 +1343,10 @@ impl Timeline { } } + Ok(_) => { + info!("skipping repartitioning due to image compaction LSN being below GC cutoff"); + } + // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} Err(err) if err.is_cancel() => {} From 33abfc2b741de285846a8cfaef5c2e158d039342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 9 May 2025 15:34:22 +0200 Subject: [PATCH 076/142] storcon: remove finished safekeeper reconciliations from in-memory hashmap (#11876) ## Problem Currently there is a memory leak, in that finished safekeeper reconciliations leave a cancellation token behind which is never cleaned up. ## Summary of changes The change adds cleanup after finishing of a reconciliation. In order to ensure we remove the correct cancellation token, and we haven't raced with another reconciliation, we introduce a `TokenId` counter to tell tokens apart. Part of https://github.com/neondatabase/neon/issues/11670 --- .../src/service/safekeeper_reconciler.rs | 133 ++++++++++++------ 1 file changed, 88 insertions(+), 45 deletions(-) diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 71c73a0112..17bb132982 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -1,4 +1,9 @@ -use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + str::FromStr, + sync::{Arc, atomic::AtomicU64}, + time::Duration, +}; use clashmap::{ClashMap, Entry}; use safekeeper_api::models::PullTimelineRequest; @@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest { pub(crate) kind: SafekeeperTimelineOpKind, } +/// A way to keep ongoing/queued reconcile requests apart +#[derive(Copy, Clone, PartialEq, Eq)] +struct TokenId(u64); + +type OngoingTokens = ClashMap<(TenantId, Option), (CancellationToken, TokenId)>; + /// Handle to per safekeeper reconciler. 
struct ReconcilerHandle { - tx: UnboundedSender<(ScheduleRequest, CancellationToken)>, - ongoing_tokens: Arc), CancellationToken>>, + tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>, + ongoing_tokens: Arc, + token_id_counter: AtomicU64, cancel: CancellationToken, } @@ -185,24 +197,28 @@ impl ReconcilerHandle { &self, tenant_id: TenantId, timeline_id: Option, - ) -> CancellationToken { + ) -> (CancellationToken, TokenId) { + let token_id = self + .token_id_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let token_id = TokenId(token_id); let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); if let Entry::Occupied(entry) = &entry { - let cancel: &CancellationToken = entry.get(); + let (cancel, _) = entry.get(); cancel.cancel(); } - entry.insert(self.cancel.child_token()).clone() + entry.insert((self.cancel.child_token(), token_id)).clone() } /// Cancel an ongoing reconciliation fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option) { - if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { + if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { cancel.cancel(); } } fn schedule_reconcile(&self, req: ScheduleRequest) { - let cancel = self.new_token_slot(req.tenant_id, req.timeline_id); + let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id); let hostname = req.safekeeper.skp.host.clone(); - if let Err(err) = self.tx.send((req, cancel)) { + if let Err(err) = self.tx.send((req, cancel, token_id)) { tracing::info!("scheduling request onto {hostname} returned error: {err}"); } } @@ -211,13 +227,14 @@ impl ReconcilerHandle { pub(crate) struct SafekeeperReconciler { inner: SafekeeperReconcilerInner, concurrency_limiter: Arc, - rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, + rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>, cancel: CancellationToken, } /// Thin wrapper over `Service` to not clutter its inherent functions #[derive(Clone)] struct SafekeeperReconcilerInner { + ongoing_tokens: Arc, service: Arc, } @@ -226,15 +243,20 @@ impl SafekeeperReconciler { // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. let (tx, rx) = mpsc::unbounded_channel(); let concurrency = service.config.safekeeper_reconciler_concurrency; + let ongoing_tokens = Arc::new(ClashMap::new()); let mut reconciler = SafekeeperReconciler { - inner: SafekeeperReconcilerInner { service }, + inner: SafekeeperReconcilerInner { + service, + ongoing_tokens: ongoing_tokens.clone(), + }, rx, concurrency_limiter: Arc::new(Semaphore::new(concurrency)), cancel: cancel.clone(), }; let handle = ReconcilerHandle { tx, - ongoing_tokens: Arc::new(ClashMap::new()), + ongoing_tokens, + token_id_counter: AtomicU64::new(0), cancel, }; tokio::spawn(async move { reconciler.run().await }); @@ -246,7 +268,9 @@ impl SafekeeperReconciler { req = self.rx.recv() => req, _ = self.cancel.cancelled() => break, }; - let Some((req, req_cancel)) = req else { break }; + let Some((req, req_cancel, req_token_id)) = req else { + break; + }; let permit_res = tokio::select! 
{ req = self.concurrency_limiter.clone().acquire_owned() => req, @@ -265,7 +289,7 @@ impl SafekeeperReconciler { let timeline_id = req.timeline_id; let node_id = req.safekeeper.skp.id; inner - .reconcile_one(req, req_cancel) + .reconcile_one(req, req_cancel, req_token_id) .instrument(tracing::info_span!( "reconcile_one", ?kind, @@ -280,8 +304,14 @@ impl SafekeeperReconciler { } impl SafekeeperReconcilerInner { - async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { + async fn reconcile_one( + &self, + req: ScheduleRequest, + req_cancel: CancellationToken, + req_token_id: TokenId, + ) { let req_host = req.safekeeper.skp.host.clone(); + let success; match req.kind { SafekeeperTimelineOpKind::Pull => { let Some(timeline_id) = req.timeline_id else { @@ -302,19 +332,22 @@ impl SafekeeperReconcilerInner { tenant_id: req.tenant_id, timeline_id, }; - self.reconcile_inner( - req, - async |client| client.pull_timeline(&pull_req).await, - |resp| { - if let Some(host) = resp.safekeeper_host { - tracing::info!("pulled timeline from {host} onto {req_host}"); - } else { - tracing::info!("timeline already present on safekeeper on {req_host}"); - } - }, - req_cancel, - ) - .await; + success = self + .reconcile_inner( + &req, + async |client| client.pull_timeline(&pull_req).await, + |resp| { + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!( + "timeline already present on safekeeper on {req_host}" + ); + } + }, + req_cancel, + ) + .await; } SafekeeperTimelineOpKind::Exclude => { // TODO actually exclude instead of delete here @@ -325,22 +358,23 @@ impl SafekeeperReconcilerInner { ); return; }; - self.reconcile_inner( - req, - async |client| client.delete_timeline(tenant_id, timeline_id).await, - |_resp| { - tracing::info!("deleted timeline from {req_host}"); - }, - req_cancel, - ) - .await; + success = self + .reconcile_inner( + &req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; } SafekeeperTimelineOpKind::Delete => { let tenant_id = req.tenant_id; if let Some(timeline_id) = req.timeline_id { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_timeline(tenant_id, timeline_id).await, |_resp| { tracing::info!("deleted timeline from {req_host}"); @@ -348,13 +382,13 @@ impl SafekeeperReconcilerInner { req_cancel, ) .await; - if deleted { + if success { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } else { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_tenant(tenant_id).await, |_resp| { tracing::info!(%tenant_id, "deleted tenant from {req_host}"); @@ -362,12 +396,21 @@ impl SafekeeperReconcilerInner { req_cancel, ) .await; - if deleted { + if success { self.delete_tenant_timelines_from_db(tenant_id).await; } } } } + if success { + self.ongoing_tokens.remove_if( + &(req.tenant_id, req.timeline_id), + |_ttid, (_cancel, token_id)| { + // Ensure that this request is indeed the request we just finished and not a new one + req_token_id == *token_id + }, + ); + } } async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) { match self @@ -421,10 +464,10 @@ impl SafekeeperReconcilerInner { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } - /// Returns whether the reconciliation happened successfully + /// Returns whether 
the reconciliation happened successfully (or we got cancelled) async fn reconcile_inner( &self, - req: ScheduleRequest, + req: &ScheduleRequest, closure: impl Fn(SafekeeperClient) -> F, log_success: impl FnOnce(T) -> U, req_cancel: CancellationToken, From 3b7cc4234c8675b777a3f85798734c0b41748d11 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 9 May 2025 19:02:24 +0200 Subject: [PATCH 077/142] Fix PS connect attempt timeouts when facing interrupts (#11880) With the 50ms timeouts of pumping state in connector.c, we need to correctly handle these timeouts that also wake up pg_usleep. This new approach makes the connection attempts re-start the wait whenever it gets woken up early; and CHECK_FOR_INTERRUPTS() is called to make sure we don't miss query cancellations. ## Problem https://neondb.slack.com/archives/C04DGM6SMTM/p1746794528680269 ## Summary of changes Make sure we start sleeping again if pg_usleep got woken up ahead of time. --- pgxn/neon/libpagestore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ee4e6ccc5b..3b6c4247c3 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -433,7 +433,6 @@ pageserver_connect(shardno_t shard_no, int elevel) now = GetCurrentTimestamp(); us_since_last_attempt = (int64) (now - shard->last_reconnect_time); - shard->last_reconnect_time = now; /* * Make sure we don't do exponential backoff with a constant multiplier @@ -447,14 +446,23 @@ pageserver_connect(shardno_t shard_no, int elevel) /* * If we did other tasks between reconnect attempts, then we won't * need to wait as long as a full delay. + * + * This is a loop to protect against interrupted sleeps. */ - if (us_since_last_attempt < shard->delay_us) + while (us_since_last_attempt < shard->delay_us) { pg_usleep(shard->delay_us - us_since_last_attempt); + + /* At least we should handle cancellations here */ + CHECK_FOR_INTERRUPTS(); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); } /* update the delay metric */ shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + shard->last_reconnect_time = now; /* * Connect using the connection string we got from the From f5070f6aa4dad26b669811bf72923665f0340147 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 9 May 2025 20:13:35 +0200 Subject: [PATCH 078/142] fixup(direct IO): PR #11864 broke test suite parametrization (#11887) PR - github.com/neondatabase/neon/pull/11864 committed yesterday rendered the `PAGESERVER_VIRTUAL_FILE_IO_MODE` env-var-based parametrization ineffective. As a consequence, the tests and benchmarks in `test_runner/` were using the binary built-in-default, i.e., `buffered`. 
--- test_runner/fixtures/neon_fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 131820f23e..8f56ee4392 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1274,6 +1274,8 @@ class NeonEnv: if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + if self.pageserver_virtual_file_io_mode is not None: + ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config["compaction_algorithm"] = ( From 79ddc803af16e35c5d5a9b1c2c520c1fa88adcc4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 10 May 2025 16:19:52 +0200 Subject: [PATCH 079/142] feat(direct IO): runtime alignment validation; support config flag on macOS; default to `DirectRw` (#11868) This PR adds a runtime validation mode to check adherence to alignment and size-multiple requirements at the VirtualFile level. This can help prevent alignment bugs from slipping into production because test systems may have more lax requirements than production. (This is not the case today, but it could change in the future). It also allows catching O_DIRECT bugs on systems that don't have O_DIRECT (macOS). Consequently, we can now accept `virtual_file_io_mode={direct,direct-rw}` on macOS now. This has the side benefit of removing some annoying conditional compilation around `IoMode`. A third benefit is that it helped weed out size-multiple requirement violation bugs in how the VirtualFile unit tests exercise read and write APIs. I seized the opportunity to trim these tests down to what actually matters, i.e., exercising of the `OpenFiles` file descriptor cache. Lastly, this PR flips the binary-built-in default to `DirectRw` so that when running Python regress tests and benchmarks without specifying `PAGESERVER_VIRTUAL_FILE_IO_MODE`, one gets the production behavior. 
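For illustration, here is a minimal sketch (a hypothetical free-standing function; the real checks live in `VirtualFileInner::validate_direct_io` in the diff below and are only active in testing builds) of the three requirements the new runtime validation enforces:

```rust
/// Sketch only: assert that an IO obeys the 512-byte O_DIRECT requirements
/// of our production systems (buffer address alignment, IO size multiple,
/// file offset alignment).
fn assert_dio_requirements(buf_addr: usize, buf_len: usize, file_offset: u64) {
    const REQUIREMENT: usize = 512;
    assert_eq!(buf_addr % REQUIREMENT, 0, "buffer memory must be 512-byte aligned");
    assert_eq!(buf_len % REQUIREMENT, 0, "IO size must be a multiple of 512 bytes");
    assert_eq!(file_offset % REQUIREMENT as u64, 0, "file offset must be 512-byte aligned");
}
```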
Refs - fixes https://github.com/neondatabase/neon/issues/11676 --- .../pageserver_config/pageserver.toml | 1 + libs/pageserver_api/src/models.rs | 28 +- pageserver/benches/bench_ingest.rs | 9 +- pageserver/src/virtual_file.rs | 309 +++++++----------- pageserver/src/virtual_file/open_options.rs | 59 +++- .../fixtures/pageserver/allowed_errors.py | 7 + 6 files changed, 178 insertions(+), 235 deletions(-) diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml index 7d603b6c65..81445ed412 100644 --- a/docker-compose/pageserver_config/pageserver.toml +++ b/docker-compose/pageserver_config/pageserver.toml @@ -5,3 +5,4 @@ listen_http_addr='0.0.0.0:9898' remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address control_plane_emergency_mode=true +virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ff911499ab..5fcdefba66 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1832,6 +1832,7 @@ pub mod virtual_file { Eq, Hash, strum_macros::EnumString, + strum_macros::EnumIter, strum_macros::Display, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, @@ -1843,10 +1844,8 @@ pub mod virtual_file { /// Uses buffered IO. Buffered, /// Uses direct IO for reads only. - #[cfg(target_os = "linux")] Direct, /// Use direct IO for reads and writes. - #[cfg(target_os = "linux")] DirectRw, } @@ -1854,26 +1853,13 @@ pub mod virtual_file { pub fn preferred() -> Self { // The default behavior when running Rust unit tests without any further // flags is to use the newest behavior (DirectRw). - // The CI uses the following environment variable to unit tests for all - // different modes. + // The CI uses the environment variable to unit tests for all different modes. // NB: the Python regression & perf tests have their own defaults management // that writes pageserver.toml; they do not use this variable. 
- if cfg!(test) { - static CACHED: LazyLock = LazyLock::new(|| { - utils::env::var_serde_json_string( - "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE", - ) - .unwrap_or( - #[cfg(target_os = "linux")] - IoMode::DirectRw, - #[cfg(not(target_os = "linux"))] - IoMode::Buffered, - ) - }); - *CACHED - } else { - IoMode::Buffered - } + static ENV_OVERRIDE: LazyLock> = LazyLock::new(|| { + utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE") + }); + ENV_OVERRIDE.unwrap_or(IoMode::DirectRw) } } @@ -1883,9 +1869,7 @@ pub mod virtual_file { fn try_from(value: u8) -> Result { Ok(match value { v if v == (IoMode::Buffered as u8) => IoMode::Buffered, - #[cfg(target_os = "linux")] v if v == (IoMode::Direct as u8) => IoMode::Direct, - #[cfg(target_os = "linux")] v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw, x => return Err(x), }) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 2836450a0e..eaadfe14ae 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -14,6 +14,7 @@ use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; +use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -244,13 +245,7 @@ fn criterion_benchmark(c: &mut Criterion) { ]; let exploded_parameters = { let mut out = Vec::new(); - for io_mode in [ - IoMode::Buffered, - #[cfg(target_os = "linux")] - IoMode::Direct, - #[cfg(target_os = "linux")] - IoMode::DirectRw, - ] { + for io_mode in IoMode::iter() { for param in expect.clone() { let HandPickedParameters { volume_mib, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index f429e59ef3..c707d35114 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -74,6 +74,8 @@ pub struct VirtualFile { impl VirtualFile { /// Open a file in read-only mode. Like File::open. + /// + /// Insensitive to `virtual_file_io_mode` setting. pub async fn open>( path: P, ctx: &RequestContext, @@ -95,31 +97,20 @@ impl VirtualFile { Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await } + /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`. 
pub async fn open_with_options_v2>( path: P, - #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions, + mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); - let set_o_direct = match (mode, open_options.is_write()) { + let direct = match (mode, open_options.is_write()) { (IoMode::Buffered, _) => false, - #[cfg(target_os = "linux")] (IoMode::Direct, false) => true, - #[cfg(target_os = "linux")] (IoMode::Direct, true) => false, - #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - if set_o_direct { - #[cfg(target_os = "linux")] - { - open_options = open_options.custom_flags(nix::libc::O_DIRECT); - } - #[cfg(not(target_os = "linux"))] - unreachable!( - "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" - ); - } + open_options = open_options.direct(direct); let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -791,6 +782,12 @@ impl VirtualFileInner { where Buf: tokio_epoll_uring::IoBufMut + Send, { + self.validate_direct_io( + Slice::stable_ptr(&buf).addr(), + Slice::bytes_total(&buf), + offset, + ); + let file_guard = match self .lock_file() .await @@ -816,6 +813,8 @@ impl VirtualFileInner { offset: u64, ctx: &RequestContext, ) -> (FullSlice, Result) { + self.validate_direct_io(buf.as_ptr().addr(), buf.len(), offset); + let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), @@ -830,6 +829,64 @@ impl VirtualFileInner { (buf, result) }) } + + /// Validate all reads and writes to adhere to the O_DIRECT requirements of our production systems. + /// + /// Validating it iin userspace sets a consistent bar, independent of what actual OS/filesystem/block device is in use. + fn validate_direct_io(&self, addr: usize, size: usize, offset: u64) { + // TODO: eventually enable validation in the builds we use in real environments like staging, preprod, and prod. + if !(cfg!(feature = "testing") || cfg!(test)) { + return; + } + if !self.open_options.is_direct() { + return; + } + + // Validate buffer memory alignment. + // + // What practically matters as of Linux 6.1 is bdev_dma_alignment() + // which is practically between 512 and 4096. + // On our production systems, the value is 512. + // The IoBuffer/IoBufferMut hard-code that value. + // + // Because the alloctor might return _more_ aligned addresses than requested, + // there is a chance that testing would not catch violations of a runtime requirement stricter than 512. + { + let requirement = 512; + let remainder = addr % requirement; + assert!( + remainder == 0, + "Direct I/O buffer must be aligned: buffer_addr=0x{addr:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate offset alignment. + // + // We hard-code 512 throughout the code base. + // So enforce just that and not anything more restrictive. + // Even the shallowest testing will expose more restrictive requirements if those ever arise. + { + let requirement = 512; + let remainder = offset % requirement; + assert!( + remainder == 0, + "Direct I/O offset must be aligned: offset=0x{offset:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate buffer size multiple requirement. + // + // The requirement in Linux 6.1 is bdev_logical_block_size(). + // On our production systems, that is 512. 
+ { + let requirement = 512; + let remainder = size % requirement; + assert!( + remainder == 0, + "Direct I/O buffer size must be a multiple of {requirement}: size=0x{size:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1218,7 +1275,6 @@ mod tests { use std::sync::Arc; use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::{Rng, thread_rng}; @@ -1226,162 +1282,38 @@ mod tests { use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - enum MaybeVirtualFile { - VirtualFile(VirtualFile), - File(File), - } - - impl From for MaybeVirtualFile { - fn from(vf: VirtualFile) -> Self { - MaybeVirtualFile::VirtualFile(vf) - } - } - - impl MaybeVirtualFile { - async fn read_exact_at( - &self, - mut slice: tokio_epoll_uring::Slice, - offset: u64, - ctx: &RequestContext, - ) -> Result, Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, - MaybeVirtualFile::File(file) => { - let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); - file.read_exact_at(rust_slice, offset).map(|()| slice) - } - } - } - async fn write_all_at( - &self, - buf: FullSlice, - offset: u64, - ctx: &RequestContext, - ) -> Result<(), Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset, ctx).await; - res - } - MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset), - } - } - - // Helper function to slurp a portion of a file into a string - async fn read_string_at( - &mut self, - pos: u64, - len: usize, - ctx: &RequestContext, - ) -> Result { - let slice = IoBufferMut::with_capacity(len).slice_full(); - assert_eq!(slice.bytes_total(), len); - let slice = self.read_exact_at(slice, pos, ctx).await?; - let buf = slice.into_inner(); - assert_eq!(buf.len(), len); - - Ok(String::from_utf8(buf.to_vec()).unwrap()) - } - } - #[tokio::test] async fn test_virtual_files() -> anyhow::Result<()> { - // The real work is done in the test_files() helper function. This - // allows us to run the same set of tests against a native File, and - // VirtualFile. We trust the native Files and wouldn't need to test them, - // but this allows us to verify that the operations return the same - // results with VirtualFiles as with native Files. (Except that with - // native files, you will run out of file descriptors if the ulimit - // is low enough.) - struct A; - - impl Adapter for A { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - } - } - test_files::("virtual_files").await - } - - #[tokio::test] - async fn test_physical_files() -> anyhow::Result<()> { - struct B; - - impl Adapter for B { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - _ctx: &RequestContext, - ) -> Result { - Ok(MaybeVirtualFile::File({ - let owned_fd = opts.open(path.as_std_path()).await?; - File::from(owned_fd) - })) - } - } - - test_files::("physical_files").await - } - - /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition - /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function - /// in trait which benefits from the new lifetime capture rules already. 
- trait Adapter { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result; - } - - async fn test_files(testname: &str) -> anyhow::Result<()> - where - A: Adapter, - { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); - let testdir = crate::config::PageServerConf::test_repo_dir(testname); + let testdir = crate::config::PageServerConf::test_repo_dir("test_virtual_files"); std::fs::create_dir_all(&testdir)?; + let zeropad512 = |content: &[u8]| { + let mut buf = IoBufferMut::with_capacity_zeroed(512); + buf[..content.len()].copy_from_slice(content); + buf.freeze().slice_len() + }; + let path_a = testdir.join("file_a"); - let mut file_a = A::open( + let file_a = VirtualFile::open_with_options_v2( path_a.clone(), OpenOptions::new() + .read(true) .write(true) + // set create & truncate flags to ensure when we trigger a reopen later in this test, + // the reopen_options must have masked out those flags; if they don't, then + // the after reopen we will fail to read the `content_a` that we write here. .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; + let (_, res) = file_a.write_all_at(zeropad512(b"content_a"), 0, &ctx).await; + res?; - file_a - .write_all_at(IoBuffer::from(b"foobar").slice_len(), 0, &ctx) - .await?; - - // cannot read from a file opened in write-only mode - let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); - - // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?; - - // cannot write to a file opened in read-only mode - let _ = file_a - .write_all_at(IoBuffer::from(b"bar").slice_len(), 0, &ctx) - .await - .unwrap_err(); - - // Try simple read - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); - - // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); - let mut file_b = A::open( + let file_b = VirtualFile::open_with_options_v2( path_b.clone(), OpenOptions::new() .read(true) @@ -1391,37 +1323,44 @@ mod tests { &ctx, ) .await?; - file_b - .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) - .await?; - file_b - .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) - .await?; + let (_, res) = file_b.write_all_at(zeropad512(b"content_b"), 0, &ctx).await; + res?; - assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); + let assert_first_512_eq = async |vfile: &VirtualFile, expect: &[u8]| { + let buf = vfile + .read_exact_at(IoBufferMut::with_capacity_zeroed(512).slice_full(), 0, &ctx) + .await + .unwrap(); + assert_eq!(&buf[..], &zeropad512(expect)[..]); + }; - // Open a lot of files, enough to cause some evictions. (Or to be precise, - // open the same file many times. The effect is the same.) + // Open a lot of file descriptors / VirtualFile instances. + // Enough to cause some evictions in the fd cache. - let mut vfiles = Vec::new(); + let mut file_b_dupes = Vec::new(); for _ in 0..100 { - let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?; - assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); - vfiles.push(vfile); + let vfile = VirtualFile::open_with_options_v2( + path_b.clone(), + OpenOptions::new().read(true), + &ctx, + ) + .await?; + assert_first_512_eq(&vfile, b"content_b").await; + file_b_dupes.push(vfile); } // make sure we opened enough files to definitely cause evictions. 
- assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2); + assert!(file_b_dupes.len() > TEST_MAX_FILE_DESCRIPTORS * 2); // The underlying file descriptor for 'file_a' should be closed now. Try to read - // from it again. - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); + // from it again. The VirtualFile reopens the file internally. + assert_first_512_eq(&file_a, b"content_a").await; // Check that all the other FDs still work too. Use them in random order for // good measure. - vfiles.as_mut_slice().shuffle(&mut thread_rng()); - for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); + file_b_dupes.as_mut_slice().shuffle(&mut thread_rng()); + for vfile in file_b_dupes.iter_mut() { + assert_first_512_eq(vfile, b"content_b").await; } Ok(()) @@ -1452,7 +1391,7 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFileInner::open_with_options( + let f = VirtualFile::open_with_options_v2( &test_file_path, OpenOptions::new().read(true), &ctx, @@ -1497,8 +1436,6 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { - let ctx = - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1508,26 +1445,22 @@ mod tests { VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); - drop(file); } #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { - let ctx = - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1542,10 +1475,8 @@ mod tests { .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); } } diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index a40dfed4a4..7d478f3600 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -8,7 +8,13 @@ use super::io_engine::IoEngine; #[derive(Debug, Clone)] pub struct OpenOptions { + /// We keep a copy of the write() flag we pass to the `inner`` `OptionOptions` + /// to support [`Self::is_write`]. write: bool, + /// We don't expose + pass through a raw `custom_flags()` style API. + /// The only custom flag we support is `O_DIRECT`, which we track here + /// and map to `custom_flags()` in the [`Self::open`] method. 
+ direct: bool, inner: Inner, } #[derive(Debug, Clone)] @@ -30,6 +36,7 @@ impl Default for OpenOptions { }; Self { write: false, + direct: false, inner, } } @@ -44,6 +51,10 @@ impl OpenOptions { self.write } + pub(super) fn is_direct(&self) -> bool { + self.direct + } + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { @@ -116,13 +127,38 @@ impl OpenOptions { } pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { - match &self.inner { - Inner::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] + let mut custom_flags = 0; + if self.direct { #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { + { + custom_flags |= nix::libc::O_DIRECT; + } + #[cfg(not(target_os = "linux"))] + { + // Other platforms may be used for development but don't necessarily have a 1:1 equivalent to Linux's O_DIRECT (macOS!). + // Just don't set the flag; to catch alignment bugs typical for O_DIRECT, + // we have a runtime validation layer inside `VirtualFile::write_at` and `VirtualFile::read_at`. + static WARNING: std::sync::Once = std::sync::Once::new(); + WARNING.call_once(|| { + let span = tracing::info_span!(parent: None, "open_options"); + let _enter = span.enter(); + tracing::warn!("your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs; this warning is logged once per process"); + }); + } + } + + match self.inner.clone() { + Inner::StdFs(mut x) => x + .custom_flags(custom_flags) + .open(path) + .map(|file| file.into()), + #[cfg(target_os = "linux")] + Inner::TokioEpollUring(mut x) => { + x.custom_flags(custom_flags); let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { - let res = system.open(path, x).await; + let res = system.open(path, &x).await; ((), res) }) .await; @@ -144,19 +180,8 @@ impl OpenOptions { self } - pub fn custom_flags(mut self, flags: i32) -> Self { - if flags & nix::libc::O_APPEND != 0 { - super::io_engine::panic_operation_must_be_idempotent(); - } - match &mut self.inner { - Inner::StdFs(x) => { - let _ = x.custom_flags(flags); - } - #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { - let _ = x.custom_flags(flags); - } - } + pub fn direct(mut self, direct: bool) -> Self { + self.direct = direct; self } } diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 43bffd919c..9b564f0a60 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -111,6 +111,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", ".*BatchSpanProcessor.*", + *( + [ + r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*" + ] + if sys.platform != "linux" + else [] + ), ) From 64353b48dbd5a73fc2cf9c9eb1bd3c9b442715cc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 10 May 2025 17:06:06 +0200 Subject: [PATCH 080/142] direct+concurrent IO: retroactive RFC (#11788) refs - direct IO epic: https://github.com/neondatabase/neon/issues/8130 - concurrent IO epic https://github.com/neondatabase/neon/issues/9378 - obsoletes direct IO proposal RFC: https://github.com/neondatabase/neon/pull/8240 - discussion in 
https://neondb.slack.com/archives/C07BZ38E6SD/p1746028030574349 --- docs/rfcs/030-vectored-timeline-get.md | 2 + .../2025-04-30-direct-io-for-pageserver.md | 362 ++++++++++++++++++ ...0-pageserver-concurrent-io-on-read-path.md | 251 ++++++++++++ 3 files changed, 615 insertions(+) create mode 100644 docs/rfcs/2025-04-30-direct-io-for-pageserver.md create mode 100644 docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md index 093a964f38..e933eac5fe 100644 --- a/docs/rfcs/030-vectored-timeline-get.md +++ b/docs/rfcs/030-vectored-timeline-get.md @@ -7,6 +7,8 @@ Author: Christian Schwarz A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver. +**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link). + # Motivation During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space. diff --git a/docs/rfcs/2025-04-30-direct-io-for-pageserver.md b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md new file mode 100644 index 0000000000..847f5e4040 --- /dev/null +++ b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md @@ -0,0 +1,362 @@ +# Direct IO For Pageserver + +Date: Apr 30, 2025 + +## Summary + +This document is a retroactive RFC. It +- provides some background on what direct IO is, +- motivates why Pageserver should be using it for its IO, and +- describes how we changed Pageserver to use it. + +The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR. + +People primarily involved in this project were: +- Yuchen Liang +- Vlad Lazar +- Christian Schwarz + +## Timeline + +For posterity, here is the rough timeline of the development work that got us to where we are today. + +- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API +- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode +- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks + - Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests + - Apr to June 2024: [Epic: bypass PageCache for use data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users +- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go. +- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376)) +- Feb to March 2024: rollout batching, then concurrent+direct IO => read path and InMemoryLayer is now direct IO +- Apr 2025: develop & roll out direct IO for the write path + +## Background: Terminology & Glossary + +**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents. +The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k). +The cache lives in kernel memory and is not directly accessible through userspace. + +**Buffered IO**: an application's read/write system calls go through the kernel page cache. 
+For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents +at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict +a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes +from/to the offset `4` (`5000 = 4096 + 4`) within the cached page. If it's a write, the kernel keeps +track of the fact that the page is now "dirty" in some ancillary structure. + +**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications +made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel +asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant +ones are a) explicit request by userspace (`fsync`) and b) memory pressure. + +**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity. +If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations. +Before reusing a page like that, the page has to be written back (writeback, see above). +The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only +way to get that memory is by eviction & re-using a dirty page cache page. +Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`. +I refer to this effect as the "malloc latency backscatter" caused by buffered IO. + +**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem +is still involved because it is ultimately in charge of mapping the concept of files & offsets within them +to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers +and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155). +The IO operations will fail at runtime with EINVAL if the alignment requirements are not met. + +**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and +fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers, +kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by +the application. +It takes more effort by the application to program with direct instead of buffered IO. +The return is precise control over and a clear distinction between consumption/modification of memory vs disk. + +**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache"). +Its caching unit is 8KiB blocks of the layer files written by Pageserver. +A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer. +The default size is tiny (64MiB), very much like Postgres's `shared_buffers`. +We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year. + +**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name. +Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux. 
+However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of +IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`). + +## Background: History Of Caching In Pageserver + +For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO. +It performed write-back to the kernel using buffered IO. + +We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994). + +The introduction of `tokio-epoll-uring` required converting the code base to used owned IO buffers. +The `PageCache` pages are usable as owned IO buffers. + +We then started bypassing PageCache for user data blocks. +Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets. +The disk btree embedded in delta & image layers remains `PageCache`'d. +Epics for that work were: +- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright. +- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks: + - Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice) + - InMemoryLayer + - Compaction + +The outcome of the above: +1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache). +2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`. + +In production we size the PS `PageCache` to be 2GiB. +Thus drives hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines. +High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS). +The response to this is to migrate tenants away, or increase PS `PageCache` size. +It is currently manual but could be automated, e.g., in Storage Controller. + +In the future, we may eliminate the `PageCache` even for indirect blocks. +For example with an LRU cache that has as unit the entire disk btree content +instead of individual blocks. + +## High-Level Design + +So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache. +We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem. +This achieves the following system properties: + +**Predictable VirtualFile latencies** +* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss. +* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure. +* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe. + But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting to some degree. +* By switching to direct IO, above operations will have the (predictable) device latency -- always. + Reads and appends always go to disk. + And malloc will not have to write back dirty data. 
+ +**Explicitness & Tangibility of resource usage** +* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant. +* By using direct IO, we become explicit about the resources *disk IOPs* and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control. +* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?"). +* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that. + +**CPU Efficiency** +* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path. +* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements. + +The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. These are: +- read latency improvements for repeat reads of the same data ("locality of reference") + - asterisk: only if that state is still cache-resident by time of next access +- write throughput by having kernel page cache batch small VFS writes into bigger disk writes + - asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback + +We are **happy to make this trade-off**: +- Because of the advantages listed above. +- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache. + (At just 2GiB PS PageCache size, we average a 99.95% hit rate). + So, the latency of going to disk is only for data block reads, not the index traversal. +- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). + And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it. + (See the appendix for a more detailed explanation why this is). +- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before. + +### Desired End State + +The desired end state of the project is as follows, and with some asterisks, we have achieved it. + +All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache. + +In particular, the "data path" includes +- the wal ingest path +- compaction +- anything on the `Timeline::get` / `Timeline::get_vectored` path. + +The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache. +Hit rate target is 99.95%. + +There are no regressions to ingest latency. + +The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`. +We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs not wait for IO. +Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO). + +The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request. +We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call. +(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth). 
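As a rough worked example of the two claims above (the `~100 µs` figure is an assumed ballpark NVMe read latency used purely for illustration, not a number from this document): a random getpage request whose index traversal hits PS PageCache and whose data-block reads are issued concurrently waits for roughly one read IOP, i.e. `~100 µs`; a full batch of 32 sequential getpage requests whose values sit in the same data block shares that single read, amortizing it to `100 µs / 32 ≈ 3 µs` of wait-for-disk per request.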
+ +## Design & Implementation + +### Prerequisites + +A lot of prerequisite work had to happen to enable use of direct IO. + +To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path: +- page_service level server-side batching (config field `page_service_pipelining`) +- concurrent IO (config field `get_vectored_concurrent_io`) +The work for both of these these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376). +Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799). +The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`. +The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC. + +For the write path, and especially WAL ingest, we need to hide write latency. +We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled +buffer happen in a sidecar tokio task while new writes fill a new buffer. +We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`. +The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558). + +### Ensuring Adherence to Alignment Requirements + +Direct IO puts requirements on +- memory buffer alignment +- io size (=memory buffer size) +- file offset alignment + +The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!). + +In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe). +Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple. +We made this decision because: +- a) it is compatible with all the environments we need to run in +- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that needs to be read are far apart) +- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower). +- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO. + +This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD). + +The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements. +All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits. +Implementors of the marker traits are: +- `IoBuffer` / `IoBufferMut`: used for most reads and writes +- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!) + +The alignment requirement is infectious; it permeates bottom-up throughout the code base. +We stop the infection at roughly the same layers in the code base where we stopped permeating the +use of owned-buffers-style API for tokio-epoll-uring. The way the stopping works is by introducing +a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap. +The places where we currently stop permeating are sort of arbitrary. 
For example, it would probably +make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s. + +The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors: +- non-adherence to file offset alignment requirements +- non-adherence to io size requirements + +The following higher-level constructs ensure we meet the requirements: +- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples. +- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment. + +Note that these types are used always, regardless of whether direct IO is enabled or not. +There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512). +But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO. + +### Configuration / Feature Flagging + +In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements. +To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations. + +We set `O_DIRECT` based on: +- the VirtualFile API used to create/open the VirtualFile instance +- the `virtual_file_io_mode` configuration flag +- the OpenOptions `read` and/or `write` flags. + +The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list. +Other APIs never use `O_DIRECT`. +(The name is bad and should really be `_maybe_direct_io`.) + +The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path). +At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available. + +The `_v2` APIs then check make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags. +The result is the following runtime behavior: + +|what|OpenOptions|`v_f_io_mode`
=`buffered`|`v_f_io_mode`
=`direct`|`v_f_io_mode`
=`direct-rw`| +|-|-|-|-|-| +|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`InMemoryLayer`|read + write|()|()*|O_DIRECT| +|`DeltaLayerWriter`| write | () | () | O_DIRECT | +|`ImageLayerWriter`| write | () | () | O_DIRECT | +|`download_layer_file`|write |()|()|O_DIRECT| + +The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`. +That period was when we implemented and shipped the first version of `BufferedWriter`. +We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`. +The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later, +in https://github.com/neondatabase/neon/pull/11558. + +Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction. +For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set. + +## Correctness Validation + +The correctness risks with this project were: +- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation. + These types expose an API that is largely identical to that of the `bytes` crate and/or Vec. +- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path. + +We sadly do not have infrastructure to run pageserver under `cargo miri`. +So for memory safety issues, we relied on careful peer review. + +We do assert the production-like alignment requirements in testing builds. +However, these asserts were added retroactively. +The actual validation before rollout happened in staging and pre-prod. +We eventually enabled `=direct`/`=direct-rw` for Rust unit tests and the regression test suite. +I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements. +Evidently developer testing was good enough. + +## Performance Validation + +The read path went through a lot of iterations of benchmarking in staging and pre-prod. +The benchmarks in those environments demonstrated performance regressions early in the implementation. +It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions. + +The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns. + +## Future Work + +There is minor and major follow-up work that can be considered in the future. +Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list. + +Read Path: +- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally. + Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size + and potentially also use that to drive placement decisions of shards from StorageController + https://github.com/neondatabase/neon/issues/9288 +- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache. + But even then, an estimation of the working set would be helpful to figure out caching strategy. + +Write Path: +- BlobWriter and its users could switch back to a borrowed API https://github.com/neondatabase/neon/issues/10129 +- ... 
unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101
+- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692
+- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676
+
+Both:
+- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster.
+  This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts.
+  However, padding latencies at microsecond scale is non-trivial.
+
+Misc:
+- We should finish trimming VirtualFile's scope to be truly limited to core data path reads & writes.
+  Abstractions for reading & writing pageserver config, location config, heatmaps, etc., should use
+  APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string`
+  are good entrypoints for cleanup): https://github.com/neondatabase/neon/issues/11809
+
+# Appendix
+
+## Why The Kernel Page Cache Is Ineffective At High Tenant Density
+
+In the Motivation section, we stated:
+
+> - **The kernel page cache is ineffective** at high tenant density anyways (#tenants/pageserver instance).
+
+The reason is that the Pageserver workload sent from Computes is whatever misses the Compute's caches.
+That's either sequential scans or random reads.
+A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than the DRAM available.
+It is a complete waste to have the kernel page cache cache data blocks in this case.
+Sequential read workloads *can* benefit iff those pages have been updated recently (= no image layer yet) and close together in time/LSN space.
+In such cases, the WAL records of those updates likely sit on the same delta layer block.
+When Compute does a sequential scan, it sends a series of single-page requests for these individual pages.
+When Pageserver processes the second request in such a series, it goes to the same delta layer block and gets a kernel page cache hit.
+This dependence on the kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching.
+We can either add a small per-connection LRU cache for such delta layer blocks,
+or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice.
+This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32).
+
+There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these
+1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation)
+2. don't actually need to reconstruct images and can therefore use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching).
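To make the request-merging argument concrete, here is a small, hypothetical sketch of coalescing a run of single-page requests into larger read batches. It is not the actual vectored get planner (the real `VectoredReadPlanner` / vectored get machinery is considerably more involved); it only illustrates how a sequential scan that arrives as single-page requests collapses into a few larger reads whose latency is amortized across the batch.

```rust
/// Illustrative only: group consecutive page numbers into batches of up to
/// `max_batch` pages so that one larger read can serve the whole run, instead
/// of paying one random-read latency per page.
fn plan_batches(mut pages: Vec<u32>, max_batch: usize) -> Vec<(u32, usize)> {
    pages.sort_unstable();
    pages.dedup();
    let mut batches: Vec<(u32, usize)> = Vec::new(); // (start_page, number_of_pages)
    for page in pages {
        match batches.last_mut() {
            // Extend the current run if this page is contiguous and the batch has room.
            Some((start, len)) if page == *start + *len as u32 && *len < max_batch => *len += 1,
            _ => batches.push((page, 1)),
        }
    }
    batches
}

fn main() {
    // A sequential scan arriving as single-page requests...
    let requests = vec![100, 101, 102, 103, 104, 105, 200];
    // ...collapses into two read batches: (100, 6) and (200, 1).
    assert_eq!(plan_batches(requests, 32), vec![(100, 6), (200, 1)]);
}
```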
diff --git a/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
new file mode 100644
index 0000000000..2dc937d298
--- /dev/null
+++ b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
@@ -0,0 +1,251 @@
+# Concurrent IO for Pageserver Read Path
+
+Date: May 6, 2025
+
+## Summary
+
+This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025.
+
+The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files
+_as it traverses the layer map_ and only _waits_ once, for all of them, after traversal is complete.
+
+Assuming good PS PageCache hit rates on the index blocks during traversal, this drives the "wait-for-disk" time
+contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`.
+
+The motivation for why this work had to happen when it happened was the switch of Pageserver to
+- not cache user data blocks in PS PageCache and
+- use direct IO.
+More context on this is given in the complementary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`.
+
+### Refs
+
+- Epic: https://github.com/neondatabase/neon/issues/9378
+- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002
+- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378
+
+Design and implementation by:
+- Vlad Lazar
+- Christian Schwarz
+
+## Background & Motivation
+
+The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps:
+- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`).
+- Pass these values to walredo to reconstruct the page images.
+
+The read path used to be single-key but was made multi-key some time ago
+([internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link)).
+However, for simplicity, most of this doc will explain things in terms of a single key being requested.
+
+The `Value` retrieval step above can be broken down into the following functions:
+- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction.
+- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk.
+  The main job here is to coalesce the small value reads into larger filesystem-level read operations.
+  This layer also takes care of direct IO alignment and size-multiple requirements (cf. the direct IO RFC for details).
+  Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done.
+- **Perform the read IO** using `tokio-epoll-uring`.
+
+Before this project, the above functions were sequentially interleaved, meaning:
+1. we would advance traversal, ...
+2. discover that we need to read a value, ...
+3. read it from disk using `tokio-epoll-uring`, ...
+4. goto 1 unless we're done.
+
+This meant that if N `Value`s need to be read to reconstruct a page,
+the time we spend waiting for disk will be `random_read_io_latency * O(number_of_values)`.
+
+## Design
+
+The **traversal** and **read IO planning** jobs still happen sequentially, layer by layer, as before.
+But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution (a minimal sketch of this pattern follows below).
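Here is a minimal, self-contained sketch of that submit-then-wait-once pattern. All names are illustrative and error handling is elided; the real mechanism is the `IoConcurrency` machinery described in the following sections. Submitted IOs are boxed futures that report their result through a `oneshot` channel, a sidecar task drives them inside a `FuturesUnordered`, and the submitting task awaits the receivers only once, after it has submitted everything.

```rust
use std::{future::Future, pin::Pin};

use futures::{stream::FuturesUnordered, StreamExt};
use tokio::sync::{mpsc, oneshot};

type IoFuture = Pin<Box<dyn Future<Output = ()> + Send>>;

/// Sidecar task: drives all submitted IO futures to completion,
/// independently of the task that does layer-map traversal.
async fn sidecar(mut rx: mpsc::UnboundedReceiver<IoFuture>) {
    let mut in_flight = FuturesUnordered::new();
    loop {
        tokio::select! {
            submitted = rx.recv() => match submitted {
                Some(fut) => in_flight.push(fut),
                None => break, // all submitters are gone
            },
            Some(()) = in_flight.next(), if !in_flight.is_empty() => {}
        }
    }
    while in_flight.next().await.is_some() {} // drain remaining IOs
}

/// "Traversal" side: submit a read and get back a receiver to await later.
fn submit_read(tx: &mpsc::UnboundedSender<IoFuture>, blk: u64) -> oneshot::Receiver<Vec<u8>> {
    let (result_tx, result_rx) = oneshot::channel();
    let fut: IoFuture = Box::pin(async move {
        // Stand-in for the real `VirtualFile::read_exact_at().await`.
        let data = vec![blk as u8; 8192];
        let _ = result_tx.send(data);
    });
    let _ = tx.send(fut);
    result_rx
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::unbounded_channel();
    let sidecar = tokio::spawn(sidecar(rx));

    // Issue reads while "traversing"; do not wait yet.
    let pending: Vec<_> = (0..4).map(|blk| submit_read(&tx, blk)).collect();
    drop(tx);

    // Wait exactly once, after all IOs have been submitted.
    for rx in pending {
        let _page = rx.await.expect("io future dropped");
    }
    sidecar.await.unwrap();
}
```

A real implementation additionally has to deal with shutdown, error propagation, and the shared-resource deadlock discussed later in this RFC; the sketch ignores all of that.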
+
+After the last read from the last layer is submitted, we wait for the IOs to complete.
+
+Assuming the filesystem / disk is able to actually process the submitted IOs without queuing,
+we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`.
+
+Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe.
+Traversal will stall on an on-demand layer download if a layer is not yet resident.
+It cannot proceed without the layer being resident because its next step depends on the contents of the layer index.
+
+### Avoiding Waiting For IO During Traversal
+
+The `traversal` component in the above time-spent-waiting-for-disk estimation is dominant and needs to be minimized.
+
+Before this project, traversal needed to wait for IO in the following cases:
+1. Waiting on the PS PageCache to page in the visited layers' disk btree index blocks.
+2. When visiting a delta layer, reading the data block that contains a `Value` for a requested key,
+   to determine whether the `Value` will initialize the page (`will_init`) and therefore traversal can stop for this key.
+
+The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%.
+(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.)
+
+The solution for (2) is to source `will_init` from the disk btree index keys, which fortunately
+already encode this bit of information since the introduction of the current storage/layer format.
+
+### Concurrent IOs, Submission & Completion
+
+To separate IO submission from waiting for its completion,
+we introduce the notion of an `IoConcurrency` struct through which IOs are issued.
+
+An IO is an opaque future that
+- captures the `tx` side of a `oneshot` channel,
+- performs the read IO by calling `VirtualFile::read_exact_at().await`, and
+- sends the result into the `tx`.
+
+Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct.
+
+The traversal code that submits the IO stores the corresponding `oneshot::Receiver`
+in the `VectoredValueReconstructState`, in the place where we previously stored
+the sequentially read `img` and `records` fields.
+
+When we're done with traversal, we wait for all submitted IOs:
+for each key, there is a future that awaits all the `oneshot::Receiver`s
+for that key, and then calls into walredo to reconstruct the page image.
+Walredo is now invoked concurrently for each value instead of sequentially.
+Walredo itself remains unchanged.
+
+The spawned IO futures are driven to completion by a sidecar tokio task that
+is separate from the task that performs all the layer visiting and spawning of IOs.
+That task receives the IO futures via an unbounded mpsc channel and
+drives them to completion inside a `FuturesUnordered`.
+
+### Error handling, Panics, Cancellation-Safety
+
+There are two error classes during reconstruct data retrieval:
+* traversal errors: index lookup, move to next layer, and the like
+* value read IO errors
+
+A traversal error fails the entire `get_vectored` request, as before this PR.
+A value read error only fails reconstruction of that value.
+
+Panics and dropping of the `get_vectored` future before it completes
+leaves the sidecar task running and does not cancel submitted IOs
+(see the next section for details on sidecar task lifecycle).
+
+All of this is safe, but today's preference in the team is to close out
+all resource usage explicitly if possible, rather than cancelling + forgetting
+about it on drop. So, there is a warning if we drop a
+`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs.
+
+### Sidecar Task Lifecycle
+
+The sidecar tokio task is spawned by the `IoConcurrency::spawn_from_conf` constructor.
+The `IoConcurrency` object acts as a handle through which IO futures are submitted.
+
+The spawned tokio task holds the `Timeline::gate` open.
+It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped.
+
+Once the `IoConcurrency` struct is dropped, no new IO futures can come in,
+but already submitted IO futures will be driven to completion regardless.
+We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe.
+But the underlying kernel and hardware resources are not magically freed up by that.
+So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete.
+Under normal conditions, this should be in the low hundreds of microseconds.
+
+It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of
+tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack.
+The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to
+the (short-lived) functions/scopes where we issue the IOs.
+We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)).
+For now, we just add another argument to the relevant code paths.
+
+### Feature Gating
+
+The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`.
+
+The behavior from before this project is available through `IoConcurrency::Sequential`,
+which awaits the IO futures in place, without "spawning" or "submitting" them anywhere.
+
+The `get_vectored_concurrent_io` pageserver config variable determines the runtime value,
+**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object.
+
+### Alternatives Explored & Caveats Encountered
+
+A few words on the rationale behind having a sidecar *task* and which
+alternatives were considered but abandoned.
+
+#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work
+
+We explored not having a sidecar task and instead having a `FuturesUnordered` per
+`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the
+first time after traversal is complete (i.e., at `collect_pending_ios`).
+
+The obvious disadvantage, though not a showstopper, is that we wouldn't be submitting
+IOs until traversal is complete.
+
+The showstopper, however, is that deadlocks happen if we don't drive the
+IO futures to completion independently of the traversal task.
+The reason is that both the IO futures and the traversal task may hold _some_,
+_and_ try to acquire _more_, shared limited resources.
+For example, both the traversal task and an IO future may try to acquire
+* a `VirtualFile` file descriptor cache slot async mutex (observed during impl)
+* a `tokio-epoll-uring` submission slot (observed during impl)
+* a `PageCache` slot (currently this is not the case, but we may move more code into the IO futures in the future)
+
+#### Why We Don't Do `tokio::task`-per-IO-future
+
+Another option is to spawn a short-lived `tokio::task` for each IO future.
+We implemented and benchmarked it during development, but found little
+throughput improvement and moderate mean & tail latency degradation.
+Concerns about pressure on the tokio scheduler led us to abandon this variant.
+
+## Future Work
+
+In addition to what is listed here, also check the "Punted" list in the epic:
+https://github.com/neondatabase/neon/issues/9378
+
+### Enable `Timeline::get`
+
+The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`.
+The impact is that roughly the following parts of pageserver do not benefit yet:
+- parts of basebackup
+- reads performed by the ingest path
+- most internal operations that read metadata keys (e.g. `collect_keyspace`!)
+
+The solution is to propagate `IoConcurrency` via `RequestContext`: https://github.com/neondatabase/neon/issues/10460
+
+The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext).
+
+Also, propagation via `RequestContext` makes it harder to tell during development whether a given
+piece of code uses concurrent vs sequential mode: one has to recursively walk up the call tree to find the
+place that puts the `IoConcurrency` into the `RequestContext`.
+We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some
+observability to weed out places that fail to enrich with a properly spawned `IoConcurrency::spawn_from_conf`.
+
+### Concurrent On-Demand Downloads Enabled By Detached Indices
+
+As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index.
+Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695),
+we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example:
+- Move the `Layer::get_or_maybe_download().await` inside the IO futures.
+  This goes in the opposite direction of the next "future work" item below, but it's easy to do.
+- Serve the IO future directly from object storage and dispatch the layer download
+  to some other actor, e.g., an actor that is responsible for both downloads & eviction.
+
+### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion
+
+Instead of the `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API
+that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission,
+and waiting for completion.
+
+The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`.
+
+A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full),
+while avoiding spending CPU cycles on processing completions while we're still traversing.
+
+The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing.
+So, the submission part of the split API needs to process completions if squeue is full. + +In any way, this split API is precondition for the bigger issue with the design presented here, +which we dicsuss in the next section. + +### Opaque Futures Are Brittle + +The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating. +However, we take on **brittleness** because callers must guarantee that the submitted futures are independent. +By our experience, it is non-trivial to identify or rule out the interdependencies. +See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details. + +The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer") +and get back a means to wait for completion. +The subsystem can thereby reason by its own how operations may be related; +unlike today, where the submitted opaque future can do just about anything. From a537b2ffd05cb952a3198ca8b36e0dfdfd26e270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 12 May 2025 09:25:54 +0200 Subject: [PATCH 081/142] pull_timeline: check tombstones by default (#11873) Make `pull_timeline` check tombstones by default. Otherwise, we'd be recreating timelines if the order between creation and deletion got mixed up, as seen in #11838. Fixes #11838. --- libs/safekeeper_api/src/models.rs | 1 + safekeeper/src/pull_timeline.rs | 6 +++++- storage_controller/src/service/safekeeper_reconciler.rs | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index cc31b38fe7..8658dc4011 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -299,6 +299,7 @@ pub struct PullTimelineRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub http_hosts: Vec, + pub ignore_tombstone: Option, } #[derive(Debug, Serialize, Deserialize)] diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 66f2877cc5..c955e667bd 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -468,12 +468,15 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); + let check_tombstone = !request.ignore_tombstone.unwrap_or_default(); + match pull_timeline( status, safekeeper_host, sk_auth_token, http_client, global_timelines, + check_tombstone, ) .await { @@ -499,6 +502,7 @@ async fn pull_timeline( sk_auth_token: Option, http_client: reqwest::Client, global_timelines: Arc, + check_tombstone: bool, ) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( @@ -570,7 +574,7 @@ async fn pull_timeline( // Finally, load the timeline. 
let _tli = global_timelines - .load_temp_timeline(ttid, &tli_dir_path, false) + .load_temp_timeline(ttid, &tli_dir_path, check_tombstone) .await?; Ok(PullTimelineResponse { diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 17bb132982..f756d98c64 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -331,6 +331,7 @@ impl SafekeeperReconcilerInner { http_hosts, tenant_id: req.tenant_id, timeline_id, + ignore_tombstone: Some(false), }; success = self .reconcile_inner( From 307e1e64c8f9edf641ae92e920821af4eb013b09 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 12 May 2025 17:17:35 +0800 Subject: [PATCH 082/142] fix(scrubber): more logs wrt relic timelines (#11895) ## Problem Further investigation on https://github.com/neondatabase/neon/issues/11159 reveals that the list_tenant function can find all the shards of the tenant, but then the shard gets missing during the gc timeline list blob. One reason could be that in some ways the timeline gets recognized as a relic timeline. ## Summary of changes Add logging to help identify the issue. Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 3 ++- storage_scrubber/src/pageserver_physical_gc.rs | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b151b612bf..40f3523a7e 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -355,6 +355,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { + tracing::warn!("listing raced with removal of an index, retrying"); // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? @@ -441,7 +442,7 @@ async fn list_timeline_blobs_impl( } if index_part_keys.is_empty() && s3_layers.is_empty() { - tracing::debug!("Timeline is empty: expected post-deletion state."); + tracing::info!("Timeline is empty: expected post-deletion state."); if initdb_archive { tracing::info!("Timeline is post deletion but initdb archive is still present."); } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index e1a4095a3c..49ab192285 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -593,6 +593,7 @@ async fn gc_timeline( index_part_snapshot_time: _, } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { + tracing::info!("Skipping timeline {ttid}, it is a relic"); // Post-deletion tenant location: don't try and GC it. 
return Ok(summary); } From a618056770cf83e3a6ff44ccea92d0e15cc1c67a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Mon, 12 May 2025 13:24:33 +0200 Subject: [PATCH 083/142] chore(compute): skip audit logs for pg_session_jwt extension (#11883) references https://github.com/neondatabase/cloud/issues/28480#issuecomment-2866961124 related https://github.com/neondatabase/cloud/issues/28863 cc @MihaiBojin @conradludgate --- compute_tools/src/config.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 42d245f55a..933b30134f 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -224,7 +224,10 @@ pub fn write_postgres_conf( writeln!(file, "pgaudit.log_rotation_age=5")?; // Enable audit logs for pg_session_jwt extension - writeln!(file, "pg_session_jwt.audit_log=on")?; + // TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as + // pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863 + // + // writeln!(file, "pg_session_jwt.audit_log=on")?; // Add audit shared_preload_libraries, if they are not present. // From a77919f4b2668277795d731a343f0955bf144eb7 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 May 2025 16:48:48 +0100 Subject: [PATCH 084/142] merge pg-sni-router into proxy (#11882) ## Problem We realised that pg-sni-router doesn't need to be separate from proxy. just a separate port. ## Summary of changes Add pg-sni-router config to proxy and expose the service. --- proxy/src/binary/local_proxy.rs | 4 +- proxy/src/binary/pg_sni_router.rs | 106 +++++---- proxy/src/binary/proxy.rs | 212 ++++++++++++------ proxy/src/tls/server_config.rs | 33 +-- test_runner/fixtures/neon_fixtures.py | 25 +++ .../regress/test_proxy_metric_collection.py | 4 + test_runner/regress/test_sni_router.py | 26 ++- 7 files changed, 283 insertions(+), 127 deletions(-) diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index ee7f6ffcd7..a566383390 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -423,8 +423,8 @@ async fn refresh_config_inner( if let Some(tls_config) = data.tls { let tls_config = tokio::task::spawn_blocking(move || { crate::tls::server_config::configure_tls( - &tls_config.key_path, - &tls_config.cert_path, + tls_config.key_path.as_ref(), + tls_config.cert_path.as_ref(), None, false, ) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 19be058ac3..2239d064b2 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -1,8 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. +//! A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. 
+ +use std::path::Path; use std::{net::SocketAddr, sync::Arc}; use anyhow::{Context, anyhow, bail, ensure}; @@ -86,46 +88,7 @@ pub async fn run() -> anyhow::Result<()> { args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .expect("keys should not be empty") - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); - - (tls_config, tls_server_end_point) - } + (Some(key_path), Some(cert_path)) => parse_tls(key_path.as_ref(), cert_path.as_ref())?, _ => bail!("tls-key and tls-cert must be specified"), }; @@ -188,7 +151,58 @@ pub async fn run() -> anyhow::Result<()> { match signal {} } -async fn task_main( +pub(super) fn parse_tls( + key_path: &Path, + cert_path: &Path, +) -> anyhow::Result<(Arc, TlsServerEndPoint)> { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + PrivateKeyDer::Pkcs8( + keys.pop() + .expect("keys should not be empty") + .context(format!( + "Failed to read TLS keys at '{}'", + key_path.display() + ))?, + ) + }; + + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; + + let cert_chain: Vec<_> = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() + ) + })? + }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? 
+ .into(); + + Ok((tls_config, tls_server_end_point)) +} + +pub(super) async fn task_main( dest_suffix: Arc, tls_config: Arc, compute_tls_config: Option>, diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e03f2f33d9..fe0d551f7f 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -1,9 +1,10 @@ use std::net::SocketAddr; +use std::path::PathBuf; use std::pin::pin; use std::sync::Arc; use std::time::Duration; -use anyhow::bail; +use anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; use futures::future::Either; use remote_storage::RemoteStorageConfig; @@ -62,18 +63,18 @@ struct ProxyCliArgs { region: String, /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, + proxy: SocketAddr, #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] auth_backend: AuthBackendType, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, + mgmt: SocketAddr, /// listen for incoming http connections (metrics, etc) on ip:port #[clap(long, default_value = "127.0.0.1:7001")] - http: String, + http: SocketAddr, /// listen for incoming wss connections on ip:port #[clap(long)] - wss: Option, + wss: Option, /// redirect unauthenticated users to the given uri in case of console redirect auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, @@ -99,18 +100,18 @@ struct ProxyCliArgs { /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, + tls_key: Option, /// path to TLS cert for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, + tls_cert: Option, /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. #[clap(long, alias = "allow-ssl-keylogfile")] allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] - certs_dir: Option, + certs_dir: Option, /// timeout for the TLS handshake #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] handshake_timeout: tokio::time::Duration, @@ -229,6 +230,9 @@ struct ProxyCliArgs { // TODO: rename to `console_redirect_confirmation_timeout`. 
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] webauth_confirmation_timeout: std::time::Duration, + + #[clap(flatten)] + pg_sni_router: PgSniRouterArgs, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -277,6 +281,25 @@ struct SqlOverHttpArgs { sql_over_http_max_response_size_bytes: usize, } +#[derive(clap::Args, Clone, Debug)] +struct PgSniRouterArgs { + /// listen for incoming client connections on ip:port + #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")] + listen: SocketAddr, + /// listen for incoming client connections on ip:port, requiring TLS to compute + #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")] + listen_tls: SocketAddr, + /// path to TLS key for client postgres connections + #[clap(id = "sni-router-tls-key", long)] + tls_key: Option, + /// path to TLS cert for client postgres connections + #[clap(id = "sni-router-tls-cert", long)] + tls_cert: Option, + /// append this domain zone to the SNI hostname to get the destination address + #[clap(id = "sni-router-destination", long)] + dest: Option, +} + pub async fn run() -> anyhow::Result<()> { let _logging_guard = crate::logging::init().await?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); @@ -307,73 +330,51 @@ pub async fn run() -> anyhow::Result<()> { Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), } info!("Using region: {}", args.aws_region); - - // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) - } - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!( - "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" - ); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } else { - regional_redis_client.clone() - }; + let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?; // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; + info!("Starting http on {}", args.http); + let http_listener = TcpListener::bind(args.http).await?.into_std()?; - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; + info!("Starting mgmt on {}", args.mgmt); + let mgmt_listener = TcpListener::bind(args.mgmt).await?; let proxy_listener = if args.is_auth_broker { None } else { - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); + info!("Starting 
proxy on {}", args.proxy); + Some(TcpListener::bind(args.proxy).await?) + }; - Some(TcpListener::bind(proxy_address).await?) + let sni_router_listeners = { + let args = &args.pg_sni_router; + if args.dest.is_some() { + ensure!( + args.tls_key.is_some(), + "sni-router-tls-key must be provided" + ); + ensure!( + args.tls_cert.is_some(), + "sni-router-tls-cert must be provided" + ); + + info!( + "Starting pg-sni-router on {} and {}", + args.listen, args.listen_tls + ); + + Some(( + TcpListener::bind(args.listen).await?, + TcpListener::bind(args.listen_tls).await?, + )) + } else { + None + } }; // TODO: rename the argument to something like serverless. // It now covers more than just websockets, it also covers SQL over HTTP. let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; info!("Starting wss on {serverless_address}"); Some(TcpListener::bind(serverless_address).await?) } else if args.is_auth_broker { @@ -458,6 +459,37 @@ pub async fn run() -> anyhow::Result<()> { } } + // spawn pg-sni-router mode. + if let Some((listen, listen_tls)) = sni_router_listeners { + let args = args.pg_sni_router; + let dest = args.dest.expect("already asserted it is set"); + let key_path = args.tls_key.expect("already asserted it is set"); + let cert_path = args.tls_cert.expect("already asserted it is set"); + + let (tls_config, tls_server_end_point) = + super::pg_sni_router::parse_tls(&key_path, &cert_path)?; + + let dest = Arc::new(dest); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest.clone(), + tls_config.clone(), + None, + tls_server_end_point, + listen, + cancellation_token.clone(), + )); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest, + tls_config, + Some(config.connect_to_compute.tls.clone()), + tls_server_end_point, + listen_tls, + cancellation_token.clone(), + )); + } + client_tasks.spawn(crate::context::parquet::worker( cancellation_token.clone(), args.parquet_upload, @@ -565,7 +597,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, cert_path, - args.certs_dir.as_ref(), + args.certs_dir.as_deref(), args.allow_tls_keylogfile, )?), (None, None) => None, @@ -811,6 +843,60 @@ fn build_auth_backend( } } +async fn configure_redis( + args: &ProxyCliArgs, +) -> anyhow::Result<( + Option, + Option, +)> { + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) + } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache::CredentialsProvider::new( + args.aws_region.clone(), + args.redis_cluster_name.clone(), + args.redis_user_id.clone(), + ) + .await, + ), + ), + (None, None) => { + // todo: upgrade to error? 
+ warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = &args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url)) + } else { + regional_redis_client.clone() + }; + + Ok((regional_redis_client, redis_notifications_client)) +} + #[cfg(test)] mod tests { use std::time::Duration; diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 8f8917ef62..66c53b3aff 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::path::Path; use std::sync::Arc; use anyhow::{Context, bail}; @@ -21,9 +22,9 @@ pub struct TlsConfig { /// Configure TLS for the main endpoint. pub fn configure_tls( - key_path: &str, - cert_path: &str, - certs_dir: Option<&String>, + key_path: &Path, + cert_path: &Path, + certs_dir: Option<&Path>, allow_tls_keylogfile: bool, ) -> anyhow::Result { // add default certificate @@ -39,8 +40,7 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver - .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; + cert_resolver.add_cert_path(&key_path, &cert_path)?; } } } @@ -86,7 +86,7 @@ pub struct CertResolver { } impl CertResolver { - fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result { + fn parse_new(key_path: &Path, cert_path: &Path) -> anyhow::Result { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; Self::new(priv_key, cert_chain) } @@ -103,7 +103,7 @@ impl CertResolver { Ok(Self { certs, default }) } - fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + fn add_cert_path(&mut self, key_path: &Path, cert_path: &Path) -> anyhow::Result<()> { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; self.add_cert(priv_key, cert_chain) } @@ -124,26 +124,29 @@ impl CertResolver { } fn parse_key_cert( - key_path: &str, - cert_path: &str, + key_path: &Path, + cert_path: &Path, ) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { let priv_key = { let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + .with_context(|| format!("Failed to read TLS keys at '{}'", key_path.display()))?; rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? + .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? }; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) .try_collect() .with_context(|| { format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() ) })? 
}; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8f56ee4392..2801a0e867 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3607,6 +3607,8 @@ class NeonProxy(PgProtocol): http_port: int, mgmt_port: int, external_http_port: int, + router_port: int, + router_tls_port: int, auth_backend: NeonProxy.AuthBackend, metric_collection_endpoint: str | None = None, metric_collection_interval: str | None = None, @@ -3623,6 +3625,8 @@ class NeonProxy(PgProtocol): self.test_output_dir = test_output_dir self.proxy_port = proxy_port self.mgmt_port = mgmt_port + self.router_port = router_port + self.router_tls_port = router_tls_port self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval @@ -3637,6 +3641,14 @@ class NeonProxy(PgProtocol): key_path = self.test_output_dir / "proxy.key" generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) + # generate key for pg-sni-router. + # endpoint.namespace.local.neon.build resolves to 127.0.0.1 + generate_proxy_tls_certs( + "endpoint.namespace.local.neon.build", + self.test_output_dir / "router.key", + self.test_output_dir / "router.crt", + ) + args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], @@ -3646,6 +3658,11 @@ class NeonProxy(PgProtocol): *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], + *["--sni-router-listen", f"{self.host}:{self.router_port}"], + *["--sni-router-listen-tls", f"{self.host}:{self.router_tls_port}"], + *["--sni-router-tls-cert", str(self.test_output_dir / "router.crt")], + *["--sni-router-tls-key", str(self.test_output_dir / "router.key")], + *["--sni-router-destination", "local.neon.build"], *self.auth_backend.extra_args(), ] @@ -3945,6 +3962,8 @@ def link_proxy( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -3952,6 +3971,8 @@ def link_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Link(), ) as proxy: @@ -3985,6 +4006,8 @@ def static_proxy( mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -3992,6 +4015,8 @@ def static_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py index 85d8a6daaa..7442d50f68 100644 --- a/test_runner/regress/test_proxy_metric_collection.py +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -52,6 +52,8 @@ def proxy_with_metric_collector( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = 
port_distributor.get_port() (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -63,6 +65,8 @@ def proxy_with_metric_collector( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, metric_collection_endpoint=metric_collection_endpoint, metric_collection_interval=metric_collection_interval, diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 19952fc71b..61893f22ba 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING import backoff from fixtures.log_helper import log -from fixtures.neon_fixtures import PgProtocol, VanillaPostgres +from fixtures.neon_fixtures import NeonProxy, PgProtocol, VanillaPostgres if TYPE_CHECKING: from pathlib import Path @@ -41,6 +41,7 @@ class PgSniRouter(PgProtocol): self, neon_binpath: Path, port: int, + tls_port: int, destination: str, tls_cert: Path, tls_key: Path, @@ -53,6 +54,7 @@ class PgSniRouter(PgProtocol): self.host = host self.neon_binpath = neon_binpath self.port = port + self.tls_port = tls_port self.destination = destination self.tls_cert = tls_cert self.tls_key = tls_key @@ -64,6 +66,7 @@ class PgSniRouter(PgProtocol): args = [ str(self.neon_binpath / "pg_sni_router"), *["--listen", f"127.0.0.1:{self.port}"], + *["--listen-tls", f"127.0.0.1:{self.tls_port}"], *["--tls-cert", str(self.tls_cert)], *["--tls-key", str(self.tls_key)], *["--destination", self.destination], @@ -127,10 +130,12 @@ def test_pg_sni_router( pg_port = vanilla_pg.default_options["port"] router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with PgSniRouter( neon_binpath=neon_binpath, port=router_port, + tls_port=router_tls_port, destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", @@ -146,3 +151,22 @@ def test_pg_sni_router( hostaddr="127.0.0.1", ) assert out[0][0] == 1 + + +def test_pg_sni_router_in_proxy( + static_proxy: NeonProxy, + vanilla_pg: VanillaPostgres, +): + # static_proxy starts this. + assert vanilla_pg.is_running() + pg_port = vanilla_pg.default_options["port"] + + out = static_proxy.safe_psql( + "select 1", + dbname="postgres", + sslmode="require", + host=f"endpoint--namespace--{pg_port}.local.neon.build", + hostaddr="127.0.0.1", + port=static_proxy.router_port, + ) + assert out[0][0] == 1 From 9971fba5848ca3928b54e123a338d454e6c65283 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 12 May 2025 12:36:07 -0500 Subject: [PATCH 085/142] Properly configure the dynamic loader to load our compiled libraries (#11858) The first line in /etc/ld.so.conf is: /etc/ld.so.conf.d/* We want to control library load order so that our compiled binaries are picked up before others from system packages. The previous solution allowed the system libraries to load before ours. 
Part-of: https://github.com/neondatabase/neon/issues/11857 Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 3 ++- compute/etc/ld.so.conf.d/00-neon.conf | 1 + docker-compose/compute_wrapper/shell/compute.sh | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 compute/etc/ld.so.conf.d/00-neon.conf diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6233eaf709..e6e6053554 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1971,7 +1971,8 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Make the libraries we built available -RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf +RUN /sbin/ldconfig # rsyslog config permissions # directory for rsyslogd pid file diff --git a/compute/etc/ld.so.conf.d/00-neon.conf b/compute/etc/ld.so.conf.d/00-neon.conf new file mode 100644 index 0000000000..e8e4bdcd42 --- /dev/null +++ b/compute/etc/ld.so.conf.d/00-neon.conf @@ -0,0 +1 @@ +/usr/local/lib diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 723b2f8afb..20a1ffb7a0 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -14,6 +14,14 @@ PG_VERSION=${PG_VERSION:-14} CONFIG_FILE_ORG=/var/db/postgres/configs/config.json CONFIG_FILE=/tmp/config.json +# Test that the first library path that the dynamic loader looks in is the path +# that we use for custom compiled software +first_path="$(ldconfig --verbose 2>/dev/null \ + | grep --invert-match ^$'\t' \ + | cut --delimiter=: --fields=1 \ + | head --lines=1)" +test "$first_path" == '/usr/local/lib' || true # Remove the || true in a follow-up PR. Needed for backwards compat. + echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do sleep 1; From a113c48c43c9ff0130e404e47a55e4721bbb63a4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 13 May 2025 09:33:53 +0100 Subject: [PATCH 086/142] proxy: fix redis batching support (#11905) ## Problem For `StoreCancelKey`, we were inserting 2 commands, but we were not inserting two replies. This mismatch leads to errors when decoding the response. ## Summary of changes Abstract the command + reply pipeline so that commands and replies are registered at the same time. 
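In miniature, and with the Redis specifics stripped away, the invariant the new `Pipeline` wrapper enforces looks roughly like the following sketch (hypothetical, simplified types; the actual code pairs `redis::Cmd`s with `CancelReplyOp`s and calls `.ignore()` on commands that have no reply):

```rust
/// Simplified illustration of the fix: a command is registered either together
/// with its reply slot, or explicitly as having no reply, so the number of
/// expected replies can no longer drift out of sync with the batch.
struct Pipeline<C, R> {
    commands: Vec<(C, bool)>, // (command, expects_reply)
    replies: Vec<R>,
}

impl<C, R> Pipeline<C, R> {
    fn new() -> Self {
        Self { commands: Vec::new(), replies: Vec::new() }
    }
    fn add_command_with_reply(&mut self, cmd: C, reply: R) {
        self.commands.push((cmd, true));
        self.replies.push(reply);
    }
    fn add_command_no_reply(&mut self, cmd: C) {
        self.commands.push((cmd, false));
    }
    /// Invariant that the old code violated for `StoreCancelKey`
    /// (two commands queued, only one reply registered).
    fn expected_replies(&self) -> usize {
        debug_assert_eq!(
            self.commands.iter().filter(|(_, expects)| *expects).count(),
            self.replies.len()
        );
        self.replies.len()
    }
}

fn main() {
    let mut p: Pipeline<String, &str> = Pipeline::new();
    // The HSET gets a reply slot; the companion EXPIRE is fire-and-forget.
    p.add_command_with_reply("HSET cancel-key ...".to_string(), "hset-reply");
    p.add_command_no_reply("EXPIRE cancel-key 3600".to_string());
    assert_eq!(p.expected_replies(), 1);
}
```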
--- proxy/src/cancellation.rs | 125 ++++++++++++++++++++++++-------------- proxy/src/redis/kv_ops.rs | 2 +- 2 files changed, 79 insertions(+), 48 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c5ba04eb8c..f34fb747ca 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,12 +6,12 @@ use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::CancelToken; use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; -use redis::{FromRedisValue, Pipeline, Value, pipe}; +use redis::{Cmd, FromRedisValue, Value}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; use crate::auth::{AuthError, check_peer_addr_is_in_list}; @@ -56,8 +56,70 @@ pub enum CancelKeyOp { }, } +pub struct Pipeline { + inner: redis::Pipeline, + replies: Vec, +} + +impl Pipeline { + fn with_capacity(n: usize) -> Self { + Self { + inner: redis::Pipeline::with_capacity(n), + replies: Vec::with_capacity(n), + } + } + + async fn execute(&mut self, client: &mut RedisKVClient) { + let responses = self.replies.len(); + let batch_size = self.inner.len(); + + match client.query(&self.inner).await { + // for each reply, we expect that many values. + Ok(Value::Array(values)) if values.len() == responses => { + debug!( + batch_size, + responses, "successfully completed cancellation jobs", + ); + for (value, reply) in std::iter::zip(values, self.replies.drain(..)) { + reply.send_value(value); + } + } + Ok(value) => { + error!(batch_size, ?value, "unexpected redis return value"); + for reply in self.replies.drain(..) { + reply.send_err(anyhow!("incorrect response type from redis")); + } + } + Err(err) => { + for reply in self.replies.drain(..) 
{ + reply.send_err(anyhow!("could not send cmd to redis: {err}")); + } + } + } + + self.inner.clear(); + self.replies.clear(); + } + + fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) { + self.inner.add_command(cmd); + self.replies.push(reply); + } + + fn add_command_no_reply(&mut self, cmd: Cmd) { + self.inner.add_command(cmd).ignore(); + } + + fn add_command(&mut self, cmd: Cmd, reply: Option) { + match reply { + Some(reply) => self.add_command_with_reply(cmd, reply), + None => self.add_command_no_reply(cmd), + } + } +} + impl CancelKeyOp { - fn register(self, pipe: &mut Pipeline) -> Option { + fn register(self, pipe: &mut Pipeline) { #[allow(clippy::used_underscore_binding)] match self { CancelKeyOp::StoreCancelKey { @@ -68,18 +130,18 @@ impl CancelKeyOp { _guard, expire, } => { - pipe.hset(&key, field, value); - pipe.expire(key, expire); - let resp_tx = resp_tx?; - Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hset(&key, field, value), reply); + pipe.add_command_no_reply(Cmd::expire(key, expire)); } CancelKeyOp::GetCancelData { key, resp_tx, _guard, } => { - pipe.hgetall(key); - Some(CancelReplyOp::GetCancelData { resp_tx, _guard }) + let reply = CancelReplyOp::GetCancelData { resp_tx, _guard }; + pipe.add_command_with_reply(Cmd::hgetall(key), reply); } CancelKeyOp::RemoveCancelKey { key, @@ -87,9 +149,9 @@ impl CancelKeyOp { resp_tx, _guard, } => { - pipe.hdel(key, field); - let resp_tx = resp_tx?; - Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hdel(key, field), reply); } } } @@ -170,8 +232,8 @@ pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, ) -> anyhow::Result<()> { - let mut batch = Vec::new(); - let mut replies = vec![]; + let mut batch = Vec::with_capacity(BATCH_SIZE); + let mut pipeline = Pipeline::with_capacity(BATCH_SIZE); loop { if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { @@ -182,42 +244,11 @@ pub async fn handle_cancel_messages( let batch_size = batch.len(); debug!(batch_size, "running cancellation jobs"); - let mut pipe = pipe(); for msg in batch.drain(..) { - if let Some(reply) = msg.register(&mut pipe) { - replies.push(reply); - } else { - pipe.ignore(); - } + msg.register(&mut pipeline); } - let responses = replies.len(); - - match client.query(pipe).await { - // for each reply, we expect that many values. - Ok(Value::Array(values)) if values.len() == responses => { - debug!( - batch_size, - responses, "successfully completed cancellation jobs", - ); - for (value, reply) in std::iter::zip(values, replies.drain(..)) { - reply.send_value(value); - } - } - Ok(value) => { - debug!(?value, "unexpected redis return value"); - for reply in replies.drain(..) { - reply.send_err(anyhow!("incorrect response type from redis")); - } - } - Err(err) => { - for reply in replies.drain(..) 
{ - reply.send_err(anyhow!("could not send cmd to redis: {err}")); - } - } - } - - replies.clear(); + pipeline.execute(client).await; } } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index aa627b29a6..f71730c533 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -47,7 +47,7 @@ impl RedisKVClient { pub(crate) async fn query( &mut self, - q: impl Queryable, + q: &impl Queryable, ) -> anyhow::Result { if !self.limiter.check() { tracing::info!("Rate limit exceeded. Skipping query"); From a9979620c508a089f3f3d6e020877349ff555b0f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 13 May 2025 16:53:35 +0800 Subject: [PATCH 087/142] fix(remote_storage): continue on Azure+AWS retryable error (#11903) ## Problem We implemented the retry logic in AWS S3 but not in Azure. Therefore, if there is an error during Azure listing, we will return an Err to the caller, and the stream will end without fetching more tenants. Part of https://github.com/neondatabase/neon/issues/11159 Without this fix, listing tenant will stop once we hit an error (could be network errors -- that happens more frequent on Azure). If we happen to stop at a point that we only listed part of the shards, we will hit the "missed shards" error or even remove layers being used. This bug (for Azure listing) was introduced as part of https://github.com/neondatabase/neon/pull/9840 There is also a bug that stops the stream for AWS when there's a timeout -- this is fixed along with this patch. ## Summary of changes Retry the request on error. In the future, we should make such streams return something like `Result>` where the outer result is the error that ends the stream and the inner one is the error that should be retried by the caller. --------- Signed-off-by: Alex Chi Z --- libs/remote_storage/src/azure_blob.rs | 11 +++++++++-- libs/remote_storage/src/s3_bucket.rs | 9 ++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index a5cddb840f..5363e935e3 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -330,11 +330,18 @@ impl AzureBlobStorage { if let Err(DownloadError::Timeout) = &next_item { timeout_try_cnt += 1; if timeout_try_cnt <= 5 { - continue; + continue 'outer; } } - let next_item = next_item?; + let next_item = match next_item { + Ok(next_item) => next_item, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue 'outer; + }, + }; // Log a warning if we saw two timeouts in a row before a successful request if timeout_try_cnt > 2 { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 918d9d5a6b..d98ff552ee 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -657,7 +657,14 @@ impl RemoteStorage for S3Bucket { res = request => Ok(res), _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }?; + }; + + if let Err(DownloadError::Timeout) = &response { + yield Err(DownloadError::Timeout); + continue 'outer; + } + + let response = response?; // always yield cancellation errors and stop the stream let response = response .context("Failed to list S3 prefixes") From 34a42b00caf9e4c45fa3ce29ba95aa2ae7278d05 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Tue, 13 May 2025 17:49:14 +0800 Subject: [PATCH 088/142] feat(pageserver): add PostHog lite client (#11821) ## Problem part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes Add a lite PostHog client that only uses the local flag evaluation functionality. Added a test case that parses an example feature flag and gets the evaluation result. TODO: support boolean flag, remote config; implement all operators in PostHog. --------- Signed-off-by: Alex Chi Z --- Cargo.lock | 16 + Cargo.toml | 1 + libs/posthog_client_lite/Cargo.toml | 14 + libs/posthog_client_lite/src/lib.rs | 634 ++++++++++++++++++++++++++++ workspace_hack/Cargo.toml | 3 + 5 files changed, 668 insertions(+) create mode 100644 libs/posthog_client_lite/Cargo.toml create mode 100644 libs/posthog_client_lite/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 7083baa092..6df5d4a71e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4848,6 +4848,19 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "posthog_client_lite" +version = "0.1.0" +dependencies = [ + "anyhow", + "reqwest", + "serde", + "serde_json", + "sha2", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -8439,8 +8452,10 @@ dependencies = [ "fail", "form_urlencoded", "futures-channel", + "futures-core", "futures-executor", "futures-io", + "futures-task", "futures-util", "generic-array", "getrandom 0.2.11", @@ -8470,6 +8485,7 @@ dependencies = [ "once_cell", "p256 0.13.2", "parquet", + "percent-encoding", "prettyplease", "proc-macro2", "prost 0.13.3", diff --git a/Cargo.toml b/Cargo.toml index 8d4cc4a75a..6b87ce549d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ members = [ "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", + "libs/posthog_client_lite", "libs/pq_proto", "libs/tenant_size_model", "libs/metrics", diff --git a/libs/posthog_client_lite/Cargo.toml b/libs/posthog_client_lite/Cargo.toml new file mode 100644 index 0000000000..7c19bf2ccb --- /dev/null +++ b/libs/posthog_client_lite/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "posthog_client_lite" +version = "0.1.0" +edition = "2024" +license.workspace = true + +[dependencies] +anyhow.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json.workspace = true +sha2.workspace = true +workspace_hack.workspace = true +thiserror.workspace = true diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs new file mode 100644 index 0000000000..53deb26ab7 --- /dev/null +++ b/libs/posthog_client_lite/src/lib.rs @@ -0,0 +1,634 @@ +//! A lite version of the PostHog client that only supports local evaluation of feature flags. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use serde_json::json; +use sha2::Digest; + +#[derive(Debug, thiserror::Error)] +pub enum PostHogEvaluationError { + /// The feature flag is not available, for example, because the local evaluation data is not populated yet. + #[error("Feature flag not available: {0}")] + NotAvailable(String), + #[error("No condition group is matched")] + NoConditionGroupMatched, + /// Real errors, e.g., the rollout percentage does not add up to 100. 
+ #[error("Failed to evaluate feature flag: {0}")] + Internal(String), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationResponse { + #[allow(dead_code)] + flags: Vec, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlag { + key: String, + filters: LocalEvaluationFlagFilters, + active: bool, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilters { + groups: Vec, + multivariate: LocalEvaluationFlagMultivariate, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterGroup { + variant: Option, + properties: Option>, + rollout_percentage: i64, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterProperty { + key: String, + value: PostHogFlagFilterPropertyValue, + operator: String, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(untagged)] +pub enum PostHogFlagFilterPropertyValue { + String(String), + Number(f64), + Boolean(bool), + List(Vec), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariate { + variants: Vec, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariateVariant { + key: String, + rollout_percentage: i64, +} + +pub struct FeatureStore { + flags: HashMap, +} + +impl Default for FeatureStore { + fn default() -> Self { + Self::new() + } +} + +enum GroupEvaluationResult { + MatchedAndOverride(String), + MatchedAndEvaluate, + Unmatched, +} + +impl FeatureStore { + pub fn new() -> Self { + Self { + flags: HashMap::new(), + } + } + + pub fn set_flags(&mut self, flags: Vec) { + self.flags.clear(); + for flag in flags { + self.flags.insert(flag.key.clone(), flag); + } + } + + /// Generate a consistent hash for a user ID (e.g., tenant ID). + /// + /// The implementation is different from PostHog SDK. In PostHog SDK, it is sha1 of `user_id.distinct_id.salt`. + /// However, as we do not upload all of our tenant IDs to PostHog, we do not have the PostHog distinct_id for a + /// tenant. Therefore, the way we compute it is sha256 of `user_id.feature_id.salt`. + fn consistent_hash(user_id: &str, flag_key: &str, salt: &str) -> f64 { + let mut hasher = sha2::Sha256::new(); + hasher.update(user_id); + hasher.update("."); + hasher.update(flag_key); + hasher.update("."); + hasher.update(salt); + let hash = hasher.finalize(); + let hash_int = u64::from_le_bytes(hash[..8].try_into().unwrap()); + hash_int as f64 / u64::MAX as f64 + } + + /// Evaluate a condition. Returns an error if the condition cannot be evaluated due to parsing error or missing + /// property. 
+ fn evaluate_condition( + &self, + operator: &str, + provided: &PostHogFlagFilterPropertyValue, + requested: &PostHogFlagFilterPropertyValue, + ) -> Result { + match operator { + "exact" => { + let PostHogFlagFilterPropertyValue::String(provided) = provided else { + // Left should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a string: {:?}", + provided + ))); + }; + let PostHogFlagFilterPropertyValue::List(requested) = requested else { + // Right should be a list of string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a list: {:?}", + requested + ))); + }; + Ok(requested.contains(provided)) + } + "lt" | "gt" => { + let PostHogFlagFilterPropertyValue::String(requested) = requested else { + // Right should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a string: {:?}", + requested + ))); + }; + let Ok(requested) = requested.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse the right side of the condition as a number: {:?}", + requested + ))); + }; + // Left can either be a number or a string + let provided = match provided { + PostHogFlagFilterPropertyValue::Number(provided) => *provided, + PostHogFlagFilterPropertyValue::String(provided) => { + let Ok(provided) = provided.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse the left side of the condition as a number: {:?}", + provided + ))); + }; + provided + } + _ => { + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a number or a string: {:?}", + provided + ))); + } + }; + match operator { + "lt" => Ok(provided < requested), + "gt" => Ok(provided > requested), + op => Err(PostHogEvaluationError::Internal(format!( + "Unsupported operator: {}", + op + ))), + } + } + _ => Err(PostHogEvaluationError::Internal(format!( + "Unsupported operator: {}", + operator + ))), + } + } + + /// Evaluate a percentage. + fn evaluate_percentage(&self, mapped_user_id: f64, percentage: i64) -> bool { + mapped_user_id <= percentage as f64 / 100.0 + } + + /// Evaluate a filter group for a feature flag. Returns an error if there are errors during the evaluation. + /// + /// Return values: + /// Ok(GroupEvaluationResult::MatchedAndOverride(variant)): matched and evaluated to this value + /// Ok(GroupEvaluationResult::MatchedAndEvaluate): condition matched but no variant override, use the global rollout percentage + /// Ok(GroupEvaluationResult::Unmatched): condition unmatched + fn evaluate_group( + &self, + group: &LocalEvaluationFlagFilterGroup, + hash_on_group_rollout_percentage: f64, + provided_properties: &HashMap, + ) -> Result { + if let Some(ref properties) = group.properties { + for property in properties { + if let Some(value) = provided_properties.get(&property.key) { + // The user provided the property value + if !self.evaluate_condition( + property.operator.as_ref(), + value, + &property.value, + )? 
{ + return Ok(GroupEvaluationResult::Unmatched); + } + } else { + // We cannot evaluate, the property is not available + return Err(PostHogEvaluationError::NotAvailable(format!( + "The required property in the condition is not available: {}", + property.key + ))); + } + } + } + + // The group has no condition matchers or we matched the properties + if self.evaluate_percentage(hash_on_group_rollout_percentage, group.rollout_percentage) { + if let Some(ref variant_override) = group.variant { + Ok(GroupEvaluationResult::MatchedAndOverride( + variant_override.clone(), + )) + } else { + Ok(GroupEvaluationResult::MatchedAndEvaluate) + } + } else { + Ok(GroupEvaluationResult::Unmatched) + } + } + + /// Evaluate a multivariate feature flag. Returns `None` if the flag is not available or if there are errors + /// during the evaluation. + /// + /// The parsing logic is as follows: + /// + /// * Match each filter group. + /// - If a group is matched, it will first determine whether the user is in the range of the group's rollout + /// percentage. We will generate a consistent hash for the user ID on the group rollout percentage. This hash + /// is shared across all groups. + /// - If the hash falls within the group's rollout percentage, return the variant if it's overridden, or + /// - Evaluate the variant using the global config and the global rollout percentage. + /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the + /// rollout percentage. + /// * If there are no matching groups, return an error. + /// + /// Example: we have a multivariate flag with 3 groups of the configured global rollout percentage: A (10%), B (20%), C (70%). + /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override. + /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C. + pub fn evaluate_multivariate( + &self, + flag_key: &str, + user_id: &str, + ) -> Result { + let hash_on_global_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "multivariate"); + let hash_on_group_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "within_group"); + self.evaluate_multivariate_inner( + flag_key, + hash_on_global_rollout_percentage, + hash_on_group_rollout_percentage, + &HashMap::new(), + ) + } + + /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID + /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests + /// and avoid duplicate computations. + /// + /// Use a different consistent hash for evaluating the group rollout percentage. + /// The behavior: if the condition is set to rolling out to 10% of the users, and + /// we set the variant A to 20% in the global config, then 2% of the total users will + /// be evaluated to variant A. + /// + /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two + /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users + /// will be evaluated (versus 30% if group evaluation is done independently). 
+ pub(crate) fn evaluate_multivariate_inner( + &self, + flag_key: &str, + hash_on_global_rollout_percentage: f64, + hash_on_group_rollout_percentage: f64, + properties: &HashMap, + ) -> Result { + if let Some(flag_config) = self.flags.get(flag_key) { + if !flag_config.active { + return Err(PostHogEvaluationError::NotAvailable(format!( + "The feature flag is not active: {}", + flag_key + ))); + } + // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog + // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it + // does not matter. + for group in &flag_config.filters.groups { + match self.evaluate_group(group, hash_on_group_rollout_percentage, properties)? { + GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant), + GroupEvaluationResult::MatchedAndEvaluate => { + let mut percentage = 0; + for variant in &flag_config.filters.multivariate.variants { + percentage += variant.rollout_percentage; + if self + .evaluate_percentage(hash_on_global_rollout_percentage, percentage) + { + return Ok(variant.key.clone()); + } + } + // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog + // returned invalid spec, we return an error. + return Err(PostHogEvaluationError::Internal(format!( + "Rollout percentage does not add up to 100: {}", + flag_key + ))); + } + GroupEvaluationResult::Unmatched => continue, + } + } + // If no group is matched, the feature is not available, and up to the caller to decide what to do. + Err(PostHogEvaluationError::NoConditionGroupMatched) + } else { + // The feature flag is not available yet + Err(PostHogEvaluationError::NotAvailable(format!( + "Not found in the local evaluation spec: {}", + flag_key + ))) + } + } +} + +/// A lite PostHog client. +/// +/// At the point of writing this code, PostHog does not have a functional Rust client with feature flag support. +/// This is a lite version that only supports local evaluation of feature flags and only supports those JSON specs +/// that will be used within Neon. +/// +/// PostHog is designed as a browser-server system: the browser (client) side uses the client key and is exposed +/// to the end users; the server side uses a server key and is not exposed to the end users. The client and the +/// server has different API keys and provide a different set of APIs. In Neon, we only have the server (that is +/// pageserver), and it will use both the client API and the server API. So we need to store two API keys within +/// our PostHog client. +/// +/// The server API is used to fetch the feature flag specs. The client API is used to capture events in case we +/// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as an UI to +/// configure feature flags so it is very likely that the client API will not be used. +pub struct PostHogClient { + /// The server API key. + server_api_key: String, + /// The client API key. + client_api_key: String, + /// The project ID. + project_id: String, + /// The private API URL. + private_api_url: String, + /// The public API URL. + public_api_url: String, + /// The HTTP client. 
+ client: reqwest::Client, +} + +impl PostHogClient { + pub fn new( + server_api_key: String, + client_api_key: String, + project_id: String, + private_api_url: String, + public_api_url: String, + ) -> Self { + let client = reqwest::Client::new(); + Self { + server_api_key, + client_api_key, + project_id, + private_api_url, + public_api_url, + client, + } + } + + pub fn new_with_us_region( + server_api_key: String, + client_api_key: String, + project_id: String, + ) -> Self { + Self::new( + server_api_key, + client_api_key, + project_id, + "https://us.posthog.com".to_string(), + "https://us.i.posthog.com".to_string(), + ) + } + + /// Fetch the feature flag specs from the server. + /// + /// This is unfortunately an undocumented API at: + /// - + /// - + /// + /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation. + /// See `_compute_flag_locally` in + pub async fn get_feature_flags_local_evaluation( + &self, + ) -> anyhow::Result { + // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation + // with bearer token of self.server_api_key + let url = format!( + "{}/api/projects/{}/feature_flags/local_evaluation", + self.private_api_url, self.project_id + ); + let response = self + .client + .get(url) + .bearer_auth(&self.server_api_key) + .send() + .await?; + let body = response.text().await?; + Ok(serde_json::from_str(&body)?) + } + + /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though + /// it also support a lot of other functionalities. + /// + /// + pub async fn capture_event( + &self, + event: &str, + distinct_id: &str, + properties: &HashMap, + ) -> anyhow::Result<()> { + // PUBLIC_URL/capture/ + // with bearer token of self.client_api_key + let url = format!("{}/capture/", self.public_api_url); + self.client + .post(url) + .body(serde_json::to_string(&json!({ + "api_key": self.client_api_key, + "distinct_id": distinct_id, + "event": event, + "properties": properties, + }))?) 
+ .send() + .await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn data() -> &'static str { + r#"{ + "flags": [ + { + "id": 132794, + "team_id": 152860, + "name": "", + "key": "gc-compaction", + "filters": { + "groups": [ + { + "variant": "enabled-stage-2", + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 50 + }, + { + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 80 + } + ], + "payloads": {}, + "multivariate": { + "variants": [ + { + "key": "disabled", + "name": "", + "rollout_percentage": 90 + }, + { + "key": "enabled-stage-1", + "name": "", + "rollout_percentage": 10 + }, + { + "key": "enabled-stage-2", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled-stage-3", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled", + "name": "", + "rollout_percentage": 0 + } + ] + } + }, + "deleted": false, + "active": true, + "ensure_experience_continuity": false, + "has_encrypted_payloads": false, + "version": 6 + } + ], + "group_type_mapping": {}, + "cohorts": {} + }"# + } + + #[test] + fn parse_local_evaluation() { + let data = data(); + let _: LocalEvaluationResponse = serde_json::from_str(data).unwrap(); + } + + #[test] + fn evaluate_multivariate() { + let mut store = FeatureStore::new(); + let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); + store.set_flags(response.flags); + + // This lacks the required properties and cannot be evaluated. + let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &HashMap::new()); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NotAvailable(_)) + ),); + + let properties_unmatched = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("paid".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // This does not match any group so there will be an error. + let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + let variant = + store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.80, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + + let properties = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("free".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override. + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.10, 0.10, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-2".to_string()); + + // It matches the second group as 0.50 <= 0.60 <= 0.80 and the properties are matched. Then it gets evaluated using the global percentage. 
+ let variant = store.evaluate_multivariate_inner("gc-compaction", 0.99, 0.60, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-1".to_string()); + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.60, &properties); + assert_eq!(variant.unwrap(), "disabled".to_string()); + + // It matches the group conditions but not the group rollout percentage. + let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.90, &properties); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f3d8b951a8..fecf62f756 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,8 +39,10 @@ env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } form_urlencoded = { version = "1" } futures-channel = { version = "0.3", features = ["sink"] } +futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } +futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } @@ -70,6 +72,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } +percent-encoding = { version = "2" } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } From cfbef4d586f96b9f5e0648d0a7ea04db54b86962 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 13 May 2025 14:02:25 +0100 Subject: [PATCH 089/142] safekeeper: downgrade stream from future WAL log (#11909) ## Problem 1. Safekeeper selection on the pageserver side isn't very dynamic. Once you connect to one safekeeper, you'll use that one for as long as the safekeeper keeps the connection alive. In principle, we could be more eager, since the wal receiver connection can be cancelled but we don't do that. We wait until the "session" is done and then we pick a new SK. 2. Picking a new SK is quite conservative. We will switch if: a. We haven't received anything from the SK within the last 10 seconds (wal_connect_timeout) or b. The candidate SK is 1GiB ahead or c. The candidate SK is in the same AZ as the PS or d. There's a candidate that is ahead and we've not had any WAL within the last 10 seconds (lagging_wal_timeout) Hence, we can end up with pageservers that are requesting WAL which their safekeeper hasn't seen yet. ## Summary of changes Downgrade warning log to info. 
--- safekeeper/src/send_wal.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 33e3d0485c..05f827494e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -513,7 +513,7 @@ impl SafekeeperPostgresHandler { let end_pos = end_watch.get(); if end_pos < start_pos { - warn!( + info!( "requested start_pos {} is ahead of available WAL end_pos {}", start_pos, end_pos ); From 25ab16ee248e0873939569075b836f5d85d3d5f8 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 13 May 2025 14:30:09 +0100 Subject: [PATCH 090/142] chore(compute): Postgres 17.5, 16.9, 15.13 and 14.18 (#11886) Bump all minor versions. the only conflict was src/backend/storage/smgr/smgr.c in v17 where our smgr changes conflicted with https://github.com/postgres/postgres/commit/ee578921b60ef9a14eaea54b608549e4f8b14f26 but it was trivial to resolve. --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 16 ++++++++-------- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 06b405bc98..ead1e76bdc 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 06b405bc982fd53522689aa4acbfd9c44b7993cf +Subproject commit ead1e76bdcb71ef87f52f0610bd7333247f75179 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 72f83df76c..052df87d33 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 72f83df76c61ce18d81bd371f0afd2a43d59c052 +Subproject commit 052df87d338dc30687d0c96f1a4d9b6cb4882b2e diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d72d76f2cd..bb5eee65ac 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d72d76f2cdee4194dd052ce099e9784aca7c794a +Subproject commit bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 0d59c91c1a..e5374b7299 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44 +Subproject commit e5374b72997b0afc8374137674e873f7a558120a diff --git a/vendor/revisions.json b/vendor/revisions.json index e76510f969..cf9f474e1a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.4", - "0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44" + "17.5", + "e5374b72997b0afc8374137674e873f7a558120a" ], "v16": [ - "16.8", - "d72d76f2cdee4194dd052ce099e9784aca7c794a" + "16.9", + "bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd" ], "v15": [ - "15.12", - "72f83df76c61ce18d81bd371f0afd2a43d59c052" + "15.13", + "052df87d338dc30687d0c96f1a4d9b6cb4882b2e" ], "v14": [ - "14.17", - "06b405bc982fd53522689aa4acbfd9c44b7993cf" + "14.18", + "ead1e76bdcb71ef87f52f0610bd7333247f75179" ] } From 290369061f22c18850e76355d2be885ee82d1302 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 13 May 2025 17:13:42 +0300 Subject: [PATCH 091/142] Check prefetch result in DEBUG_COMPARE_LOCAL mode (#11502) ## Problem Prefetched and LFC results are not checked in DEBUG_COMPARE_LOCAL mode ## Summary of changes Add check for this results as well. 
--------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 262 ++++++++++++++++--------------------- 1 file changed, 116 insertions(+), 146 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f574517b2a..31e47db7d7 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1281,75 +1281,24 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } -#if PG_MAJORVERSION_NUM < 17 -/* - * neon_read() -- Read the specified block from a relation. - */ -#if PG_MAJORVERSION_NUM < 16 -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) -#endif -{ - neon_request_lsns request_lsns; - bits8 present; - void *bufferp; - - switch (reln->smgr_relpersistence) - { - case 0: - neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(); - - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - - present = 0; - bufferp = buffer; - if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) - { - /* Prefetch hit */ - return; - } - - /* Try to read from local file cache */ - if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) - { - MyNeonCounters->file_cache_hits_total++; - return; - } - - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); - - /* - * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. - */ - communicator_prefetch_pump_state(); - #ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn) +{ if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; PGIOAlignedBlock mdbuf; PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns.request_lsn; +#if PG_MAJORVERSION_NUM >= 17 + { + void* mdbuffers[1] = { mdbuf.data }; + mdreadv(reln, forkNum, blkno, mdbuffers, 1); + } +#else mdread(reln, forkNum, blkno, mdbuf.data); +#endif memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); @@ -1413,11 +1362,105 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } } } +} +#endif + + +#if PG_MAJORVERSION_NUM < 17 + +/* + * neon_read() -- Read the specified block from a relation. 
+ */ +#if PG_MAJORVERSION_NUM < 16 +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) +#else +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) +#endif +{ + neon_request_lsns request_lsns; + bits8 present; + void *bufferp; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* Try to read PS results if they are available */ + communicator_prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + present = 0; + bufferp = buffer; + if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) + { + /* Prefetch hit */ +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + /* Try to read from local file cache */ + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) + { + MyNeonCounters->file_cache_hits_total++; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ + communicator_prefetch_pump_state(); + +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); #endif } #endif /* PG_MAJORVERSION_NUM <= 16 */ #if PG_MAJORVERSION_NUM >= 17 + +#ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages) +{ + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + for (BlockNumber i = 0; i < nblocks; i++) + { + if (BITMAP_ISSET(read_pages, i)) + { + compare_with_local(reln, forkNum, blkno + i, buffers[i], request_lsns[i].request_lsn); + } + } + } +} +#endif + + static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) @@ -1460,8 +1503,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum, request_lsns, nblocks, buffers, read_pages); +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else if (prefetch_result == nblocks) return; +#endif /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, @@ -1470,9 +1518,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else /* Read all blocks from LFC, so we're done */ if (prefetch_result + lfc_result == nblocks) return; +#endif communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, 
nblocks, read_pages); @@ -1483,91 +1536,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL - if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) - { - char pageserver_masked[BLCKSZ]; - PGIOAlignedBlock mdbuf; - PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns->request_lsn; - - for (int i = 0; i < nblocks; i++) - { - BlockNumber blkno = blocknum + i; - if (!BITMAP_ISSET(read_pages, i)) - continue; - -#if PG_MAJORVERSION_NUM >= 17 - { - void* mdbuffers[1] = { mdbuf.data }; - mdreadv(reln, forknum, blkno, mdbuffers, 1); - } -#else - mdread(reln, forknum, blkno, mdbuf.data); -#endif - - memcpy(pageserver_masked, buffers[i], BLCKSZ); - memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - - if (PageIsNew((Page) mdbuf.data)) - { - if (!PageIsNew((Page) pageserver_masked)) - { - neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffers[i])); - } - } - else if (PageIsNew((Page) buffers[i])) - { - neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf.data)); - } - else if (PageGetSpecialSize(mdbuf.data) == 0) - { - /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - hexdump_page(pageserver_masked)); - } - } - else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) - { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) - { - /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - hexdump_page(pageserver_masked)); - } - } - } - } - } + memset(read_pages, 0xFF, sizeof(read_pages)); + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); #endif } #endif From 234c882a0768876aa4616420af9a5fb132bb7b38 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 13 May 2025 14:58:37 +0000 Subject: [PATCH 092/142] proxy: Expose handlers for cpu and heap profiling (#11912) ## Problem It's difficult to understand where proxy spends most of cpu and memory. ## Summary of changes Expose cpu and heap profiling handlers for continuous profiling. 
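The `malloc_conf` export below turns on jemalloc's heap profiler at startup (`prof:true,prof_active:true`), sampling allocations roughly every 2^21 bytes (`lg_prof_sample:21`); the `/profile/cpu` and `/profile/heap` routes are then served from the same listener as `/v1/status`. As a rough usage sketch (the listener's host and port are deployment-specific assumptions, not part of this change):

```rust
/// Hypothetical helper for a profiling agent: download a heap profile from the
/// proxy's health/metrics HTTP listener. `base` is something like
/// "http://<proxy-host>:<http-port>"; the exact address is an assumption here.
async fn fetch_heap_profile(base: &str) -> anyhow::Result<Vec<u8>> {
    let resp = reqwest::get(format!("{base}/profile/heap"))
        .await?
        .error_for_status()?;
    Ok(resp.bytes().await?.to_vec())
}
```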
neondatabase/cloud#22670 --- proxy/src/bin/proxy.rs | 4 ++++ proxy/src/http/health_server.rs | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7d4b44841d..d60d32eb3b 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,10 @@ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[tokio::main] async fn main() -> anyhow::Result<()> { proxy::binary::proxy::run().await diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 5278fe2a3e..b0b5a598d1 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,7 +3,7 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; -use http_utils::endpoint::{self, request_span}; +use http_utils::endpoint::{self, profile_cpu_handler, profile_heap_handler, request_span}; use http_utils::error::ApiError; use http_utils::json::json_response; use http_utils::{RouterBuilder, RouterService}; @@ -33,6 +33,12 @@ fn make_router(metrics: AppMetrics) -> RouterBuilder { request_span(r, move |b| prometheus_metrics_handler(b, state)) }) .get("/v1/status", status_handler) + .get("/profile/cpu", move |r| { + request_span(r, profile_cpu_handler) + }) + .get("/profile/heap", move |r| { + request_span(r, profile_heap_handler) + }) } pub async fn task_main( From 045ae13e060c3717c921097444d5c6b09925e87c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 13 May 2025 18:49:49 +0100 Subject: [PATCH 093/142] pageserver: make imports work with tenant shut downs (#11855) ## Problem Lifetime of imported timelines (and implicitly the import background task) has some shortcomings: 1. Timeline activation upon import completion is tricky. Previously, a timeline that finished importing after a tenant detach would not get activated and there's concerns about the safety of activating concurrently with shut-down. 2. Import jobs can prevent tenant shut down since they hold the tenant gate ## Summary of Changes Track the import tasks in memory and abort them explicitly on tenant shutdown. Integrate more closely with the storage controller: 1. When an import task has finished all of its jobs, it notifies the storage controller, but **does not** mark the import as done in the index_part. When all shards have finished importing, the storage controller will call the `/activate_post_import` idempotent endpoint for all of them. The handler, marks the import complete in index part, resets the tenant if required and checks if the timeline is active yet. 2. Not directly related, but the import job now gets the starting state from the storage controller instead of the import bucket. This paves the way for progress checkpointing. 
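As a rough illustration of point 1, the finalize step on the storage controller side amounts to calling the new idempotent `activate_post_import` pageserver endpoint on every shard once all of them have reported their import as complete. The sketch below uses the `activate_post_import` client method added in this patch; everything else (the surrounding function, how the per-shard clients are obtained, the retry policy) is an assumption and not the actual storage controller code:

```rust
use std::time::Duration;

use pageserver_api::models::TimelineInfo;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use utils::id::TimelineId;

/// Hypothetical finalize step: once every shard has reported its import jobs as
/// done, ask each shard to activate the timeline. The endpoint is idempotent, so
/// the caller can simply retry the whole function until it succeeds.
async fn activate_imported_timeline(
    shard_clients: &[(TenantShardId, mgmt_api::Client)],
    timeline_id: TimelineId,
) -> anyhow::Result<Vec<TimelineInfo>> {
    let mut infos = Vec::with_capacity(shard_clients.len());
    for (tenant_shard_id, client) in shard_clients {
        // On the shard, this marks the import done in index_part, resets the
        // tenant if the timeline is not visible yet, and waits (bounded by the
        // timeout) for the timeline to become active before returning its info.
        let info = client
            .activate_post_import(*tenant_shard_id, timeline_id, Duration::from_secs(1))
            .await?;
        infos.push(info);
    }
    Ok(infos)
}
```

Point 2 then follows naturally: because the storage controller owns the import state, a restarted import task can ask it for the last known status via the new `get_timeline_import_status` upcall instead of re-reading the import bucket.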
Related: https://github.com/neondatabase/neon/issues/11568 --- pageserver/client/src/mgmt_api.rs | 22 ++ pageserver/src/controller_upcall_client.rs | 40 +++ pageserver/src/deletion_queue.rs | 9 + pageserver/src/http/routes.rs | 105 ++++++ pageserver/src/tenant.rs | 222 ++++++------ .../src/tenant/remote_timeline_client.rs | 29 ++ .../src/tenant/timeline/import_pgdata.rs | 284 +++++++-------- .../src/tenant/timeline/import_pgdata/flow.rs | 4 + .../import_pgdata/importbucket_client.rs | 25 -- .../import_pgdata/importbucket_format.rs | 6 - .../import_pgdata/index_part_format.rs | 8 + storage_controller/src/http.rs | 30 ++ storage_controller/src/pageserver_client.rs | 19 + storage_controller/src/persistence.rs | 33 ++ storage_controller/src/service.rs | 328 +++++++++++------- .../src/service/safekeeper_service.rs | 7 +- storage_controller/src/timeline_import.rs | 22 +- test_runner/regress/test_import_pgdata.py | 91 ++++- 18 files changed, 859 insertions(+), 425 deletions(-) diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4a87a91910..219e63c9d4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::error::Error as _; +use std::time::Duration; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -819,4 +820,25 @@ impl Client { .await .map(|resp| resp.status()) } + + pub async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + activate_timeline_timeout: Duration, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/activate_post_import?timeline_activate_timeout_ms={}", + self.mgmt_api_endpoint, + tenant_shard_id, + timeline_id, + activate_timeline_timeout.as_millis() + ); + + self.request(Method::PUT, uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 468e5463b0..6d186b091a 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -53,6 +53,11 @@ pub trait StorageControllerUpcallApi { timeline_id: TimelineId, status: ShardImportStatus, ) -> impl Future> + Send; + fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> impl Future, RetryForeverError>> + Send; } impl StorageControllerUpcallClient { @@ -302,4 +307,39 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { self.retry_http_forever(&url, request).await } + + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context + async fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result, RetryForeverError> { + let url = self + .base_url + .join(format!("timeline_import_status/{}/{}", tenant_shard_id, timeline_id).as_str()) + .expect("Failed to build path"); + + Ok(backoff::retry( + || async { + let response = self.http_client.get(url.clone()).send().await?; + + if let Err(err) = response.error_for_status_ref() { + if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) { + return Ok(None); + } else { + return Err(err); + } + } + response.json::().await.map(Some) + }, + |_| false, + 3, + u32::MAX, + "storage controller upcall", + &self.cancel, + ) + .await + .ok_or(RetryForeverError::ShuttingDown)? 
+ .expect("We retry forever, this should never be reached")) + } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 4d62bc4ab5..65b2de28cd 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -663,6 +663,7 @@ mod test { use camino::Utf8Path; use hex_literal::hex; use pageserver_api::key::Key; + use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::ShardIndex; use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -796,6 +797,14 @@ mod test { ) -> Result<(), RetryForeverError> { unimplemented!() } + + async fn get_timeline_import_status( + &self, + _tenant_shard_id: TenantShardId, + _timeline_id: TimelineId, + ) -> Result, RetryForeverError> { + unimplemented!() + } } async fn setup(test_name: &str) -> anyhow::Result { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8b6500b020..2edec9dda1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3500,6 +3500,107 @@ async fn put_tenant_timeline_import_wal( }.instrument(span).await } +/// Activate a timeline after its import has completed +/// +/// The endpoint is idempotent and callers are expected to retry all +/// errors until a successful response. +async fn activate_post_import_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + const DEFAULT_ACTIVATE_TIMEOUT: Duration = Duration::from_secs(1); + let activate_timeout = parse_query_param(&request, "timeline_activate_timeout_ms")? + .map(Duration::from_millis) + .unwrap_or(DEFAULT_ACTIVATE_TIMEOUT); + + let span = info_span!( + "activate_post_import_handler", + tenant_id=%tenant_shard_id.tenant_id, + timeline_id=%timeline_id, + shard_id=%tenant_shard_id.shard_slug() + ); + + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + tenant + .finalize_importing_timeline(timeline_id) + .await + .map_err(ApiError::InternalServerError)?; + + match tenant.get_timeline(timeline_id, false) { + Ok(_timeline) => { + // Timeline is already visible. Reset not required: fall through. + } + Err(GetTimelineError::NotFound { .. }) => { + // This is crude: we reset the whole tenant such that the new timeline is detected + // and activated. We can come up with something more granular in the future. + // + // Note that we only reset the tenant if required: when the timeline is + // not present in [`Tenant::timelines`]. + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + state + .tenant_manager + .reset_tenant(tenant_shard_id, false, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + } + Err(GetTimelineError::ShuttingDown) => { + return Err(ApiError::ShuttingDown); + } + Err(GetTimelineError::NotActive { .. 
}) => { + unreachable!("Called get_timeline with active_only=false"); + } + } + + let timeline = tenant.get_timeline(timeline_id, false)?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn) + .with_scope_timeline(&timeline); + + let result = + tokio::time::timeout(activate_timeout, timeline.wait_to_become_active(&ctx)).await; + match result { + Ok(Ok(())) => { + // fallthrough + } + // Timeline reached some other state that's not active + // TODO(vlad): if the tenant is broken, return a permananet error + Ok(Err(_timeline_state)) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Timeline activation failed" + ))); + } + // Activation timed out + Err(_) => { + return Err(ApiError::Timeout("Timeline activation timed out".into())); + } + } + + let timeline_info = build_timeline_info( + &timeline, false, // include_non_incremental_logical_size, + false, // force_await_initial_logical_size + &ctx, + ) + .await + .context("get local timeline info") + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, timeline_info) + } + .instrument(span) + .await +} + /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. @@ -3924,5 +4025,9 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", |r| api_handler(r, put_tenant_timeline_import_wal), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import", + |r| api_handler(r, activate_post_import_handler), + ) .any(handler_404)) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e59db74479..441049f47d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -50,6 +50,7 @@ use remote_timeline_client::{ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::import_pgdata::ImportingTimeline; use timeline::offload::{OffloadError, offload_timeline}; use timeline::{ CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, @@ -284,6 +285,19 @@ pub struct TenantShard { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, + /// Tracks the timelines that are currently importing into this tenant shard. + /// + /// Note that importing timelines are also present in [`Self::timelines_creating`]. + /// Keep this in mind when ordering lock acquisition. + /// + /// Lifetime: + /// * An imported timeline is created while scanning the bucket on tenant attach + /// if the index part contains an `import_pgdata` entry and said field marks the import + /// as in progress. + /// * Imported timelines are removed when the storage controller calls the post timeline + /// import activation endpoint. + timelines_importing: std::sync::Mutex>, + /// The last tenant manifest known to be in remote storage. None if the manifest has not yet /// been either downloaded or uploaded. Always Some after tenant attach. 
/// @@ -923,19 +937,10 @@ enum StartCreatingTimelineResult { #[allow(clippy::large_enum_variant, reason = "TODO")] enum TimelineInitAndSyncResult { - ReadyToActivate(Arc), + ReadyToActivate, NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), } -impl TimelineInitAndSyncResult { - fn ready_to_activate(self) -> Option> { - match self { - Self::ReadyToActivate(timeline) => Some(timeline), - _ => None, - } - } -} - #[must_use] struct TimelineInitAndSyncNeedsSpawnImportPgdata { timeline: Arc, @@ -1012,10 +1017,6 @@ enum CreateTimelineCause { enum LoadTimelineCause { Attach, Unoffload, - ImportPgdata { - create_guard: TimelineCreateGuard, - activate: ActivateTimelineArgs, - }, } #[derive(thiserror::Error, Debug)] @@ -1097,7 +1098,7 @@ impl TenantShard { self: &Arc, timeline_id: TimelineId, resources: TimelineResources, - mut index_part: IndexPart, + index_part: IndexPart, metadata: TimelineMetadata, previous_heatmap: Option, ancestor: Option>, @@ -1106,7 +1107,7 @@ impl TenantShard { ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; - let import_pgdata = index_part.import_pgdata.take(); + let import_pgdata = index_part.import_pgdata.clone(); let idempotency = match &import_pgdata { Some(import_pgdata) => { CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { @@ -1127,7 +1128,7 @@ impl TenantShard { } }; - let (timeline, timeline_ctx) = self.create_timeline_struct( + let (timeline, _timeline_ctx) = self.create_timeline_struct( timeline_id, &metadata, previous_heatmap, @@ -1197,14 +1198,6 @@ impl TenantShard { match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { .. } => { - unreachable!( - "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" - ) - } - } let mut guard = self.timelines_creating.lock().unwrap(); if !guard.insert(timeline_id) { // We should never try and load the same timeline twice during startup @@ -1260,26 +1253,7 @@ impl TenantShard { "Timeline has no ancestor and no layer files" ); - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { - create_guard, - activate, - } => { - // TODO: see the comment in the task code above how I'm not so certain - // it is safe to activate here because of concurrent shutdowns. 
- match activate { - ActivateTimelineArgs::Yes { broker_client } => { - info!("activating timeline after reload from pgdata import task"); - timeline.activate(self.clone(), broker_client, None, &timeline_ctx); - } - ActivateTimelineArgs::No => (), - } - drop(create_guard); - } - } - - Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline)) + Ok(TimelineInitAndSyncResult::ReadyToActivate) } } } @@ -1768,7 +1742,7 @@ impl TenantShard { })?; match effect { - TimelineInitAndSyncResult::ReadyToActivate(_) => { + TimelineInitAndSyncResult::ReadyToActivate => { // activation happens later, on Tenant::activate } TimelineInitAndSyncResult::NeedsSpawnImportPgdata( @@ -1778,13 +1752,24 @@ impl TenantShard { guard, }, ) => { - tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( - timeline, - import_pgdata, - ActivateTimelineArgs::No, - guard, - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), - )); + let timeline_id = timeline.timeline_id; + let import_task_handle = + tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( + timeline.clone(), + import_pgdata, + guard, + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), + )); + + let prev = self.timelines_importing.lock().unwrap().insert( + timeline_id, + ImportingTimeline { + timeline: timeline.clone(), + import_task_handle, + }, + ); + + assert!(prev.is_none()); } } } @@ -2678,14 +2663,7 @@ impl TenantShard { .await? } CreateTimelineParams::ImportPgdata(params) => { - self.create_timeline_import_pgdata( - params, - ActivateTimelineArgs::Yes { - broker_client: broker_client.clone(), - }, - ctx, - ) - .await? + self.create_timeline_import_pgdata(params, ctx).await? } }; @@ -2759,7 +2737,6 @@ impl TenantShard { async fn create_timeline_import_pgdata( self: &Arc, params: CreateTimelineParamsImportPgdata, - activate: ActivateTimelineArgs, ctx: &RequestContext, ) -> Result { let CreateTimelineParamsImportPgdata { @@ -2840,24 +2817,71 @@ impl TenantShard { let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself(); - tokio::spawn(self.clone().create_timeline_import_pgdata_task( + let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task( timeline.clone(), index_part, - activate, timeline_create_guard, timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); + let prev = self.timelines_importing.lock().unwrap().insert( + timeline.timeline_id, + ImportingTimeline { + timeline: timeline.clone(), + import_task_handle, + }, + ); + + // Idempotency is enforced higher up the stack + assert!(prev.is_none()); + // NB: the timeline doesn't exist in self.timelines at this point Ok(CreateTimelineResult::ImportSpawned(timeline)) } + /// Finalize the import of a timeline on this shard by marking it complete in + /// the index part. If the import task hasn't finished yet, returns an error. + /// + /// This method is idempotent. If the import was finalized once, the next call + /// will be a no-op. 
+ pub(crate) async fn finalize_importing_timeline( + &self, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + let timeline = { + let locked = self.timelines_importing.lock().unwrap(); + match locked.get(&timeline_id) { + Some(importing_timeline) => { + if !importing_timeline.import_task_handle.is_finished() { + return Err(anyhow::anyhow!("Import task not done yet")); + } + + importing_timeline.timeline.clone() + } + None => { + return Ok(()); + } + } + }; + + timeline + .remote_client + .schedule_index_upload_for_import_pgdata_finalize()?; + timeline.remote_client.wait_completion().await?; + + self.timelines_importing + .lock() + .unwrap() + .remove(&timeline_id); + + Ok(()) + } + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] async fn create_timeline_import_pgdata_task( self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) { @@ -2869,7 +2893,6 @@ impl TenantShard { .create_timeline_import_pgdata_task_impl( timeline, index_part, - activate, timeline_create_guard, ctx, ) @@ -2885,60 +2908,15 @@ impl TenantShard { self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, - timeline_create_guard: TimelineCreateGuard, + _timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) -> Result<(), anyhow::Error> { info!("importing pgdata"); + let ctx = ctx.with_scope_timeline(&timeline); import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) .await .context("import")?; - info!("import done"); - - // - // Reload timeline from remote. - // This proves that the remote state is attachable, and it reuses the code. - // - // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown. - // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. - // But our activate() call might launch new background tasks after TenantShard::shutdown - // already went past shutting down the TenantShard::timelines, which this timeline here is no part of. - // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting - // down while bootstrapping/branching + activating), but, the race condition is much more likely - // to manifest because of the long runtime of this import task. 
- - // in theory this shouldn't even .await anything except for coop yield - info!("shutting down timeline"); - timeline.shutdown(ShutdownMode::Hard).await; - info!("timeline shut down, reloading from remote"); - // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc - // let Some(timeline) = Arc::into_inner(timeline) else { - // anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere"); - // }; - let timeline_id = timeline.timeline_id; - - // load from object storage like TenantShard::attach does - let resources = self.build_timeline_resources(timeline_id); - let index_part = resources - .remote_client - .download_index_file(&self.cancel) - .await?; - let index_part = match index_part { - MaybeDeletedIndexPart::Deleted(_) => { - // likely concurrent delete call, cplane should prevent this - anyhow::bail!( - "index part says deleted but we are not done creating yet, this should not happen but" - ) - } - MaybeDeletedIndexPart::IndexPart(p) => p, - }; - let metadata = index_part.metadata.clone(); - self - .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{ - create_guard: timeline_create_guard, activate, }, &ctx) - .await? - .ready_to_activate() - .context("implementation error: reloaded timeline still needs import after import reported success")?; + info!("import done - waiting for activation"); anyhow::Ok(()) } @@ -3475,6 +3453,14 @@ impl TenantShard { timeline.defuse_for_tenant_drop(); }); } + { + let mut timelines_importing = self.timelines_importing.lock().unwrap(); + timelines_importing + .drain() + .for_each(|(_timeline_id, importing_timeline)| { + importing_timeline.shutdown(); + }); + } // test_long_timeline_create_then_tenant_delete is leaning on this message tracing::info!("Waiting for timelines..."); while let Some(res) = js.join_next().await { @@ -3949,13 +3935,6 @@ where Ok(result) } -enum ActivateTimelineArgs { - Yes { - broker_client: storage_broker::BrokerClientChannel, - }, - No, -} - impl TenantShard { pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig { self.tenant_conf.load().tenant_conf.clone() @@ -4322,6 +4301,7 @@ impl TenantShard { timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), timelines_offloaded: Mutex::new(HashMap::new()), + timelines_importing: Mutex::new(HashMap::new()), remote_tenant_manifest: Default::default(), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index ea29f51956..21d68495f7 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -949,6 +949,35 @@ impl RemoteTimelineClient { Ok(()) } + /// If the `import_pgdata` field marks the timeline as having an import in progress, + /// launch an index-file upload operation that transitions it to done in the background + pub(crate) fn schedule_index_upload_for_import_pgdata_finalize( + self: &Arc, + ) -> anyhow::Result<()> { + use import_pgdata::index_part_format; + + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + let to_update = match &upload_queue.dirty.import_pgdata { + Some(import) if !import.is_done() => Some(import), + Some(_) | None => None, + }; + + if let Some(old) = to_update { + let new = + index_part_format::Root::V1(index_part_format::V1::Done(index_part_format::Done 
{ + idempotency_key: old.idempotency_key().clone(), + started_at: *old.started_at(), + finished_at: chrono::Utc::now().naive_utc(), + })); + + upload_queue.dirty.import_pgdata = Some(new); + self.schedule_index_upload(upload_queue); + } + + Ok(()) + } + /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index c4a8df39a3..53e15e5395 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::{Context, bail}; use pageserver_api::models::ShardImportStatus; use remote_storage::RemotePath; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::info; use utils::lsn::Lsn; @@ -17,6 +18,17 @@ mod importbucket_client; mod importbucket_format; pub(crate) mod index_part_format; +pub(crate) struct ImportingTimeline { + pub import_task_handle: JoinHandle<()>, + pub timeline: Arc, +} + +impl ImportingTimeline { + pub(crate) fn shutdown(self) { + self.import_task_handle.abort(); + } +} + pub async fn doit( timeline: &Arc, index_part: index_part_format::Root, @@ -26,173 +38,161 @@ pub async fn doit( let index_part_format::Root::V1(v1) = index_part; let index_part_format::InProgress { location, - idempotency_key, - started_at, + idempotency_key: _, + started_at: _, } = match v1 { index_part_format::V1::Done(_) => return Ok(()), index_part_format::V1::InProgress(in_progress) => in_progress, }; - let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); - let status_prefix = RemotePath::from_string("status").unwrap(); + let shard_status = storcon_client + .get_timeline_import_status(timeline.tenant_shard_id, timeline.timeline_id) + .await + .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; - // - // See if shard is done. - // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing. - // - let shard_status_key = - status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug())); - let shard_status: Option = - storage.get_json(&shard_status_key).await?; info!(?shard_status, "peeking shard status"); - if shard_status.map(|st| st.done).unwrap_or(false) { - info!("shard status indicates that the shard is done, skipping import"); - } else { - // TODO: checkpoint the progress into the IndexPart instead of restarting - // from the beginning. + match shard_status { + None | Some(ShardImportStatus::InProgress) => { + // TODO: checkpoint the progress into the IndexPart instead of restarting + // from the beginning. - // - // Wipe the slate clean - the flow does not allow resuming. - // We can implement resuming in the future by checkpointing the progress into the IndexPart. - // - info!("wipe the slate clean"); - { - // TODO: do we need to hold GC lock for this? 
- let mut guard = timeline.layers.write().await; - assert!( - guard.layer_map()?.open_layer.is_none(), - "while importing, there should be no in-memory layer" // this just seems like a good place to assert it - ); - let all_layers_keys = guard.all_persistent_layers(); - let all_layers: Vec<_> = all_layers_keys - .iter() - .map(|key| guard.get_from_key(key)) - .collect(); - let open = guard.open_mut().context("open_mut")?; + // + // Wipe the slate clean - the flow does not allow resuming. + // We can implement resuming in the future by checkpointing the progress into the IndexPart. + // + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? + let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; - timeline.remote_client.schedule_gc_update(&all_layers)?; - open.finish_gc_timeline(&all_layers); - } - - // - // Wait for pgdata to finish uploading - // - info!("wait for pgdata to reach status 'done'"); - let pgdata_status_key = status_prefix.join("pgdata"); - loop { - let res = async { - let pgdata_status: Option = storage - .get_json(&pgdata_status_key) - .await - .context("get pgdata status")?; - info!(?pgdata_status, "peeking pgdata status"); - if pgdata_status.map(|st| st.done).unwrap_or(false) { - Ok(()) - } else { - Err(anyhow::anyhow!("pgdata not done yet")) - } + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); } - .await; - match res { - Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); - if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let storage = + importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + let status_prefix = RemotePath::from_string("status").unwrap(); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + } else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefinitely waiting for pgdata to finish"); + if tokio::time::timeout( + std::time::Duration::from_secs(10), + cancel.cancelled(), + ) .await .is_ok() - { - bail!("cancelled while waiting for pgdata"); + { + bail!("cancelled while waiting for pgdata"); + } } } } - } - // - // Do the import - // - info!("do the import"); - let control_file = storage.get_control_file().await?; - let base_lsn = control_file.base_lsn(); + // + // Do the import + // + info!("do the import"); + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); - info!("update TimelineMetadata based on LSNs from control file"); - { - let pg_version = control_file.pg_version(); - let _ctx: &RequestContext = ctx; - async move { - // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the - // checkpoint record, and 
prev_record_lsn should point to its beginning. - // We should read the real end of the record from the WAL, but here we - // just fake it. - let disk_consistent_lsn = Lsn(base_lsn.0 + 8); - let prev_record_lsn = base_lsn; - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - Some(prev_record_lsn), - None, // no ancestor - Lsn(0), // no ancestor lsn - base_lsn, // latest_gc_cutoff_lsn - base_lsn, // initdb_lsn - pg_version, - ); + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + let _ctx: &RequestContext = ctx; + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. + let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); - let _start_lsn = disk_consistent_lsn + 1; + let _start_lsn = disk_consistent_lsn + 1; - timeline - .remote_client - .schedule_index_upload_for_full_metadata_update(&metadata)?; + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; - timeline.remote_client.wait_completion().await?; + timeline.remote_client.wait_completion().await?; - anyhow::Ok(()) + anyhow::Ok(()) + } } + .await?; + + flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; + + // Communicate that shard is done. + // Ensure at-least-once delivery of the upcall to storage controller + // before we mark the task as done and never come here again. + // + // Note that we do not mark the import complete in the index part now. + // This happens in [`Tenant::finalize_importing_timeline`] in response + // to the storage controller calling + // `/v1/tenant/:tenant_id/timeline/:timeline_id/activate_post_import`. + storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + // TODO(vlad): What about import errors? + ShardImportStatus::Done, + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } + Some(ShardImportStatus::Error(err)) => { + info!( + "shard status indicates that the shard is done (error), skipping import {}", + err + ); + } + Some(ShardImportStatus::Done) => { + info!("shard status indicates that the shard is done (success), skipping import"); } - .await?; - - flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; - - // - // Communicate that shard is done. - // Ensure at-least-once delivery of the upcall to storage controller - // before we mark the task as done and never come here again. - // - let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); - storcon_client - .put_timeline_import_status( - timeline.tenant_shard_id, - timeline.timeline_id, - // TODO(vlad): What about import errors? - ShardImportStatus::Done, - ) - .await - .map_err(|_err| anyhow::anyhow!("Shut down while putting timeline import status"))?; - - storage - .put_json( - &shard_status_key, - &importbucket_format::ShardStatus { done: true }, - ) - .await - .context("put shard status")?; } - // - // Mark as done in index_part. 
- // This makes subsequent timeline loads enter the normal load code path - // instead of spawning the import task and calling this here function. - // - info!("mark import as complete in index part"); - timeline - .remote_client - .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1( - index_part_format::V1::Done(index_part_format::Done { - idempotency_key, - started_at, - finished_at: chrono::Utc::now().naive_utc(), - }), - )))?; - - timeline.remote_client.wait_completion().await?; - Ok(()) } diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 34c073365d..5b9c8ec5b5 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -53,6 +53,7 @@ use tokio_stream::StreamExt; use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; +use utils::pausable_failpoint; use super::Timeline; use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; @@ -79,6 +80,9 @@ pub async fn run( let import_config = &timeline.conf.timeline_import_config; let plan = planner.plan(import_config).await?; + + pausable_failpoint!("import-timeline-pre-execute-pausable"); + plan.execute(timeline, import_config, ctx).await } diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index e7aa8f6038..34313748b7 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -190,31 +190,6 @@ impl RemoteStorageWrapper { Ok(Some(res)) } - #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] - pub async fn put_json(&self, path: &RemotePath, value: &T) -> anyhow::Result<()> - where - T: serde::Serialize, - { - let buf = serde_json::to_vec(value)?; - let bytes = Bytes::from(buf); - utils::backoff::retry( - || async { - let size = bytes.len(); - let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); - self.storage - .upload_storage_object(bytes, size, path, &self.cancel) - .await - }, - remote_storage::TimeoutOrCancel::caused_by_cancel, - 1, - u32::MAX, - &format!("put json {path}"), - &self.cancel, - ) - .await - .expect("practically infinite retries") - } - #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn get_range( &self, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs index 57c647cc7f..d9f4da4748 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs @@ -5,9 +5,3 @@ pub struct PgdataStatus { pub done: bool, // TODO: remaining fields } - -#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] -pub struct ShardStatus { - pub done: bool, - // TODO: remaining fields -} diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index ea7a41b25f..371fc857dc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -64,4 +64,12 @@ impl Root { }, } } + pub fn started_at(&self) -> &chrono::NaiveDateTime { + match self { + Root::V1(v1) => match v1 { + V1::InProgress(in_progress) => 
&in_progress.started_at, + V1::Done(done) => &done.started_at, + }, + } + } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 649113b8ce..8d459cab9c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -157,6 +157,29 @@ async fn handle_validate(req: Request) -> Result, ApiError> json_response(StatusCode::OK, state.service.validate(validate_req).await?) } +async fn handle_get_timeline_import_status(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + json_response( + StatusCode::OK, + state + .service + .handle_timeline_shard_import_progress(tenant_shard_id, timeline_id) + .await?, + ) +} + async fn handle_put_timeline_import_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -2008,6 +2031,13 @@ pub fn make_router( .post("/upcall/v1/validate", |r| { named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) }) + .get("/upcall/v1/timeline_import_status", |r| { + named_request_span( + r, + handle_get_timeline_import_status, + RequestName("upcall_v1_timeline_import_status"), + ) + }) .post("/upcall/v1/timeline_import_status", |r| { named_request_span( r, diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 554ca375f5..817409e112 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,3 +1,5 @@ +use std::time::Duration; + use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization, @@ -212,6 +214,7 @@ impl PageserverClient { ) } + #[allow(unused)] pub(crate) async fn timeline_detail( &self, tenant_shard_id: TenantShardId, @@ -357,4 +360,20 @@ impl PageserverClient { self.inner.wait_lsn(tenant_shard_id, request).await ) } + + pub(crate) async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + timeline_activate_timeout: Duration, + ) -> Result { + measured_request!( + "activate_post_import", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .activate_post_import(tenant_shard_id, timeline_id, timeline_activate_timeout) + .await + ) + } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 9ffcf9b9e6..052c0f02eb 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1666,6 +1666,39 @@ impl Persistence { } } + pub(crate) async fn get_timeline_import( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::timeline_imports::dsl; + let persistent_import = self + .with_measured_conn(DatabaseOperation::ListTimelineImports, move |conn| { + Box::pin(async move { + let mut from_db: Vec = dsl::timeline_imports + .filter(dsl::tenant_id.eq(tenant_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id.to_string())) + .load(conn) + .await?; + + if from_db.len() > 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + from_db.len() + ))); + } 
+ + Ok(from_db.pop()) + }) + }) + .await?; + + persistent_import + .map(TimelineImport::from_persistent) + .transpose() + .map_err(|err| DatabaseError::Logical(format!("failed to deserialize import: {err}"))) + } + pub(crate) async fn delete_timeline_import( &self, tenant_id: TenantId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 193050460d..05430733c2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -35,12 +35,12 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{ self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, - PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, + PageserverUtilization, SecondaryProgress, ShardImportStatus, ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, - TimelineInfo, TimelineState, TopTenantShardItem, TopTenantShardsRequest, + TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, @@ -61,6 +61,7 @@ use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; @@ -98,7 +99,8 @@ use crate::tenant_shard::{ ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; use crate::timeline_import::{ - ShardImportStatuses, TimelineImport, TimelineImportState, UpcallClient, + ImportResult, ShardImportStatuses, TimelineImport, TimelineImportFinalizeError, + TimelineImportState, UpcallClient, }; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -3905,6 +3907,38 @@ impl Service { }) } + pub(crate) async fn handle_timeline_shard_import_progress( + self: &Arc, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let maybe_import = self + .persistence + .get_timeline_import(tenant_shard_id.tenant_id, timeline_id) + .await?; + + let import = maybe_import.ok_or_else(|| { + ApiError::NotFound( + format!( + "import for {}/{} not found", + tenant_shard_id.tenant_id, timeline_id + ) + .into(), + ) + })?; + + import + .shard_statuses + .0 + .get(&tenant_shard_id.to_index()) + .cloned() + .ok_or_else(|| { + ApiError::NotFound( + format!("shard {} not found", tenant_shard_id.shard_slug()).into(), + ) + }) + } + pub(crate) async fn handle_timeline_shard_import_progress_upcall( self: &Arc, req: PutTimelineImportStatusRequest, @@ -3943,6 +3977,16 @@ impl Service { Ok(()) } + /// Finalize the import of a timeline + /// + /// This method should be called once all shards have reported that the import is complete. + /// Firstly, it polls the post import timeline activation endpoint exposed by the pageserver. + /// Once the timeline is active on all shards, the timeline also gets created on the + /// safekeepers. Finally, notify cplane of the import completion (whether failed or + /// successful), and remove the import from the database and in-memory. 
+ /// + /// If this method gets pre-empted by shut down, it will be called again at start-up (on-going + /// imports are stored in the database). #[instrument(skip_all, fields( tenant_id=%import.tenant_id, shard_id=%import.timeline_id, @@ -3950,59 +3994,80 @@ impl Service { async fn finalize_timeline_import( self: &Arc, import: TimelineImport, - ) -> anyhow::Result<()> { + ) -> Result<(), TimelineImportFinalizeError> { tracing::info!("Finalizing timeline import"); pausable_failpoint!("timeline-import-pre-cplane-notification"); - let import_failed = import.completion_error().is_some(); + let tenant_id = import.tenant_id; + let timeline_id = import.timeline_id; - if !import_failed { - loop { - if self.cancel.is_cancelled() { - anyhow::bail!("Shut down requested while finalizing import"); - } - - let active = self.timeline_active_on_all_shards(&import).await?; - - match active { - Some(timeline_info) => { - tracing::info!("Timeline became active on all shards"); - - if self.config.timelines_onto_safekeepers { - // Now that we know the start LSN of this timeline, create it on the - // safekeepers. - self.tenant_timeline_create_safekeepers_until_success( - import.tenant_id, - timeline_info, - ) - .await?; - } - - break; - } - None => { - tracing::info!("Timeline not active on all shards yet"); - - tokio::select! { - _ = self.cancel.cancelled() => { - anyhow::bail!("Shut down requested while finalizing import"); - }, - _ = tokio::time::sleep(Duration::from_secs(5)) => {} - }; - } - } + let import_error = import.completion_error(); + match import_error { + Some(err) => { + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Err(err)) + .await?; + tracing::warn!("Timeline import completed with shard errors"); + Ok(()) } - } + None => match self.activate_timeline_post_import(&import).await { + Ok(timeline_info) => { + tracing::info!("Post import timeline activation complete"); + if self.config.timelines_onto_safekeepers { + // Now that we know the start LSN of this timeline, create it on the + // safekeepers. + self.tenant_timeline_create_safekeepers_until_success( + import.tenant_id, + timeline_info, + ) + .await?; + } + + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Ok(())) + .await?; + + tracing::info!("Timeline import completed successfully"); + Ok(()) + } + Err(TimelineImportFinalizeError::ShuttingDown) => { + // We got pre-empted by shut down and will resume after the restart. + Err(TimelineImportFinalizeError::ShuttingDown) + } + Err(err) => { + // Any finalize error apart from shut down is permanent and requires us to notify + // cplane such that it can clean up. 
+ tracing::error!("Import finalize failed with permanent error: {err}"); + self.notify_cplane_and_delete_import( + tenant_id, + timeline_id, + Err(err.to_string()), + ) + .await?; + Err(err) + } + }, + } + } + + async fn notify_cplane_and_delete_import( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, + ) -> Result<(), TimelineImportFinalizeError> { + let import_failed = import_result.is_err(); tracing::info!(%import_failed, "Notifying cplane of import completion"); let client = UpcallClient::new(self.get_config(), self.cancel.child_token()); - client.notify_import_complete(&import).await?; + client + .notify_import_complete(tenant_id, timeline_id, import_result) + .await + .map_err(|_err| TimelineImportFinalizeError::ShuttingDown)?; if let Err(err) = self .persistence - .delete_timeline_import(import.tenant_id, import.timeline_id) + .delete_timeline_import(tenant_id, timeline_id) .await { tracing::warn!("Failed to delete timeline import entry from database: {err}"); @@ -4012,14 +4077,113 @@ impl Service { .write() .unwrap() .tenants - .range_mut(TenantShardId::tenant_range(import.tenant_id)) + .range_mut(TenantShardId::tenant_range(tenant_id)) .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle); - tracing::info!(%import_failed, "Timeline import complete"); - Ok(()) } + /// Activate an imported timeline on all shards once the import is complete. + /// Returns the [`TimelineInfo`] reported by shard zero. + async fn activate_timeline_post_import( + self: &Arc, + import: &TimelineImport, + ) -> Result { + const TIMELINE_ACTIVATE_TIMEOUT: Duration = Duration::from_millis(128); + + let mut shards_to_activate: HashSet = + import.shard_statuses.0.keys().cloned().collect(); + let mut shard_zero_timeline_info = None; + + while !shards_to_activate.is_empty() { + if self.cancel.is_cancelled() { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in locked + .tenants + .range(TenantShardId::tenant_range(import.tenant_id)) + { + if !import + .shard_statuses + .0 + .contains_key(&tenant_shard_id.to_index()) + { + return Err(TimelineImportFinalizeError::MismatchedShards( + tenant_shard_id.to_index(), + )); + } + + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + targets.push((*tenant_shard_id, node.clone())); + } + } + + targets + }; + + let targeted_tenant_shards: Vec<_> = targets.iter().map(|(tid, _node)| *tid).collect(); + + let results = self + .tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .activate_post_import( + tenant_shard_id, + import.timeline_id, + TIMELINE_ACTIVATE_TIMEOUT, + ) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + let mut failed = 0; + for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) { + match result { + Ok(ok) => { + if tid.is_shard_zero() { + shard_zero_timeline_info = Some(ok); + } + + shards_to_activate.remove(&tid.to_index()); + } + Err(_err) => { + failed += 1; + } + } + } + + if failed > 0 { + tracing::info!( + "Failed to activate timeline on {failed} shards post import. Will retry" + ); + } + + tokio::select! 
{ + _ = tokio::time::sleep(Duration::from_millis(250)) => {}, + _ = self.cancel.cancelled() => { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + } + } + + Ok(shard_zero_timeline_info.expect("All shards replied")) + } + async fn finalize_timeline_imports(self: &Arc, imports: Vec) { futures::future::join_all( imports @@ -4029,78 +4193,6 @@ impl Service { .await; } - /// If the timeline is active on all shards, returns the [`TimelineInfo`] - /// collected from shard 0. - /// - /// An error is returned if the shard layout has changed during the import. - /// This is guarded against within the storage controller and the pageserver, - /// and, therefore, unexpected. - async fn timeline_active_on_all_shards( - self: &Arc, - import: &TimelineImport, - ) -> anyhow::Result> { - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in locked - .tenants - .range(TenantShardId::tenant_range(import.tenant_id)) - { - if !import - .shard_statuses - .0 - .contains_key(&tenant_shard_id.to_index()) - { - anyhow::bail!("Shard layout change detected on completion"); - } - - if let Some(node_id) = shard.intent.get_attached() { - let node = locked - .nodes - .get(node_id) - .expect("Pageservers may not be deleted while referenced"); - targets.push((*tenant_shard_id, node.clone())); - } else { - return Ok(None); - } - } - - targets - }; - - if targets.is_empty() { - anyhow::bail!("No shards found to finalize import for"); - } - - let results = self - .tenant_for_shards_api( - targets, - |tenant_shard_id, client| async move { - client - .timeline_detail(tenant_shard_id, import.timeline_id) - .await - }, - 1, - 1, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - - let all_active = results.iter().all(|res| match res { - Ok(info) => info.state == TimelineState::Active, - Err(_) => false, - }); - - if all_active { - // Both unwraps are validated above - Ok(Some(results.into_iter().next().unwrap().unwrap())) - } else { - Ok(None) - } - } - pub(crate) async fn tenant_timeline_archival_config( &self, tenant_id: TenantId, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 5c15660ba3..cd5ace449d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -10,6 +10,7 @@ use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use crate::timeline_import::TimelineImportFinalizeError; use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{ @@ -327,12 +328,12 @@ impl Service { self: &Arc, tenant_id: TenantId, timeline_info: TimelineInfo, - ) -> anyhow::Result<()> { + ) -> Result<(), TimelineImportFinalizeError> { const BACKOFF: Duration = Duration::from_secs(5); loop { if self.cancel.is_cancelled() { - anyhow::bail!("Shut down requested while finalizing import"); + return Err(TimelineImportFinalizeError::ShuttingDown); } let res = self @@ -348,7 +349,7 @@ impl Service { tracing::error!("Failed to create timeline on safekeepers: {err}"); tokio::select! 
{ _ = self.cancel.cancelled() => { - anyhow::bail!("Shut down requested while finalizing import"); + return Err(TimelineImportFinalizeError::ShuttingDown); }, _ = tokio::time::sleep(BACKOFF) => {} }; diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index 6dcc538c4b..5d9d633932 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -46,6 +46,14 @@ pub(crate) enum TimelineImportUpdateFollowUp { None, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum TimelineImportFinalizeError { + #[error("Shut down interrupted import finalize")] + ShuttingDown, + #[error("Mismatched shard detected during import finalize: {0}")] + MismatchedShards(ShardIndex), +} + pub(crate) enum TimelineImportUpdateError { ImportNotFound { tenant_id: TenantId, @@ -151,6 +159,8 @@ impl TimelineImport { } } +pub(crate) type ImportResult = Result<(), String>; + pub(crate) struct UpcallClient { authorization_header: Option, client: reqwest::Client, @@ -198,7 +208,9 @@ impl UpcallClient { /// eventual cplane availability. The cplane API is idempotent. pub(crate) async fn notify_import_complete( &self, - import: &TimelineImport, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, ) -> anyhow::Result<()> { let endpoint = if self.base_url.ends_with('/') { format!("{}import_complete", self.base_url) @@ -206,15 +218,13 @@ impl UpcallClient { format!("{}/import_complete", self.base_url) }; - tracing::info!("Endpoint is {endpoint}"); - let request = self .client .request(Method::PUT, endpoint) .json(&ImportCompleteRequest { - tenant_id: import.tenant_id, - timeline_id: import.timeline_id, - error: import.completion_error(), + tenant_id, + timeline_id, + error: import_result.err(), }) .timeout(IMPORT_COMPLETE_REQUEST_TIMEOUT); diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 05e63ad955..0472b92145 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -130,9 +130,8 @@ def test_pgdata_import_smoke( elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: - # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data - # to exercise multiple segments. - target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) + segment_size = 16 * 1024 * 1024 + target_relblock_size = segment_size * 8 else: raise ValueError @@ -413,6 +412,88 @@ def test_import_completion_on_restart( wait_until(cplane_notified) +@run_only_on_default_postgres(reason="PG version is irrelevant here") +def test_import_respects_tenant_shutdown( + neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres, make_httpserver: HTTPServer +): + """ + Validate that importing timelines respect the usual timeline life cycle: + 1. Shut down on tenant shut-down and resumes upon re-attach + 2. 
Deletion on timeline deletion (TODO) + """ + # Set up mock control plane HTTP server to listen for import completions + import_completion_signaled = Event() + + def handler(request: Request) -> Response: + log.info(f"control plane /import_complete request: {request.json}") + import_completion_signaled.set() + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request( + "/storage/api/v1/import_complete", method="PUT" + ).respond_with_handler(handler) + + # Plug the cplane mock in + neon_env_builder.control_plane_hooks_api = ( + f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" + ) + + # The import will specifiy a local filesystem path mocking remote storage + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + vanilla_pg.start() + vanilla_pg.stop() + + env = neon_env_builder.init_configs() + env.start() + + importbucket_path = neon_env_builder.repo_dir / "test_import_completion_bucket" + mock_import_bucket(vanilla_pg, importbucket_path) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + idempotency = ImportPgdataIdemptencyKey.random() + + # Pause before sending the notification + failpoint_name = "import-timeline-pre-execute-pausable" + env.pageserver.http_client().configure_failpoints((failpoint_name, "pause")) + + env.storage_controller.tenant_create(tenant_id) + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": {"LocalFs": {"path": str(importbucket_path.absolute())}}, + }, + }, + ) + + def hit_failpoint(): + log.info("Checking log for pattern...") + try: + assert env.pageserver.log_contains(f".*at failpoint {failpoint_name}.*") + except Exception: + log.exception("Failed to find pattern in log") + raise + + wait_until(hit_failpoint) + assert not import_completion_signaled.is_set() + + # Restart the pageserver while an import job is in progress. + # This clears the failpoint and we expect that the import starts up afresh + # after the restart and eventually completes. + env.pageserver.stop() + env.pageserver.start() + + def cplane_notified(): + assert import_completion_signaled.is_set() + + wait_until(cplane_notified) + + def test_fast_import_with_pageserver_ingest( test_output_dir, vanilla_pg: VanillaPostgres, @@ -520,7 +601,9 @@ def test_fast_import_with_pageserver_ingest( env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) # Run fast_import - fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"}) + fast_import.set_aws_creds( + mock_s3_server, {"RUST_LOG": "info,aws_config=debug,aws_sdk_kms=debug"} + ) pg_port = port_distributor.get_port() fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") From d47e88e35305da95b4674d6ef48f6422df7d9dab Mon Sep 17 00:00:00 2001 From: Elizabeth Murray <52375559+bizwark@users.noreply.github.com> Date: Wed, 14 May 2025 00:00:59 -0700 Subject: [PATCH 094/142] Update the pgrag version in the compute dockerfile. (#11867) ## Problem The extensions test are hanging because of pgrag. The new version of pgrag contains a fix for the hang. 
## Summary of changes --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e6e6053554..17e50697db 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1117,8 +1117,8 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \ - echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \ + echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . FROM rust-extensions-build-pgrx14 AS pgrag-build From 81fd652151c9dce2d188ff2ba7c0ed2723640efb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 14 May 2025 16:32:55 +0800 Subject: [PATCH 095/142] fix(pageserver): use better estimation for compaction memory usage (#11904) ## Problem Hopefully resolves `test_gc_feedback` flakiness. ## Summary of changes `accumulated_values` should not exceed 512MB to avoid OOM. Previously we only use number of items, which is not a good estimation. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/value.rs | 18 ++++++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 12 +++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 883d903ff3..e9000939c3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -36,6 +36,24 @@ impl Value { Value::WalRecord(rec) => rec.will_init(), } } + + #[inline(always)] + pub fn estimated_size(&self) -> usize { + match self { + Value::Image(image) => image.len(), + Value::WalRecord(NeonWalRecord::AuxFile { + content: Some(content), + .. + }) => content.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. }) => { + members.len() * 8 + } + _ => 8192, /* use image size as the estimation */ + } + } } #[derive(Debug, PartialEq)] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e7d39db70d..37c1a8f60c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3435,6 +3435,7 @@ impl Timeline { // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); + let mut accumulated_values_estimated_size = 0; let mut last_key: Option = None; // Only create image layers when there is no ancestor branches. 
TODO: create covering image layer @@ -3611,12 +3612,16 @@ impl Timeline { if last_key.is_none() { last_key = Some(key); } + accumulated_values_estimated_size += val.estimated_size(); accumulated_values.push((key, lsn, val)); - if accumulated_values.len() >= 65536 { - // Assume all of them are images, that would be 512MB of data in memory for a single key. + // Accumulated values should never exceed 512MB. + if accumulated_values_estimated_size >= 1024 * 1024 * 512 { return Err(CompactionError::Other(anyhow!( - "too many values for a single key, giving up gc-compaction" + "too many values for a single key: {} for key {}, {} items", + accumulated_values_estimated_size, + key, + accumulated_values.len() ))); } } else { @@ -3651,6 +3656,7 @@ impl Timeline { .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; + accumulated_values_estimated_size = val.estimated_size(); accumulated_values.push((key, lsn, val)); } } From a8e652d47e3dec7e588b3bb3dddecc20302a0f98 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 14 May 2025 17:25:57 +0800 Subject: [PATCH 096/142] rfc: add bottommost garbage-collection compaction (#8425) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the RFC for bottommost garbage-collection compaction --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- docs/rfcs/043-bottom-most-gc-compaction.md | 194 ++++++++++++++++++ .../01-basic-idea.svg | 135 ++++++++++++ .../03-retain-lsn.svg | 141 +++++++++++++ .../05-btmgc-parent.svg | 187 +++++++++++++++++ .../06-btmgc-child.svg | 184 +++++++++++++++++ .../07-btmgc-analysis-1.svg | 180 ++++++++++++++++ .../08-optimization.svg | 158 ++++++++++++++ .../09-btmgc-analysis-2.svg | 184 +++++++++++++++++ .../10-btmgc-analysis-3.svg | 81 ++++++++ .../11-btmgc-analysis-4.svg | 81 ++++++++ .../12-staircase-test-gc-feedback.png | Bin 0 -> 145516 bytes .../13-job-split.svg | 176 ++++++++++++++++ 12 files changed, 1701 insertions(+) create mode 100644 docs/rfcs/043-bottom-most-gc-compaction.md create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg diff --git a/docs/rfcs/043-bottom-most-gc-compaction.md b/docs/rfcs/043-bottom-most-gc-compaction.md new file mode 100644 index 0000000000..4bba758b31 --- /dev/null +++ b/docs/rfcs/043-bottom-most-gc-compaction.md @@ -0,0 +1,194 @@ +# Bottommost Garbage-Collection Compaction + +## Summary + +The goal of this doc is to propose a way to reliably collect garbages below the GC horizon. 
This process is called bottom-most garbage-collect-compaction, and is part of the broader legacy-enhanced compaction that we plan to implement in the future. + +## Motivation + +The current GC algorithm will wait until the covering via image layers before collecting the garbages of a key region. Relying on image layer generation to generate covering images is not reliable. There are prior arts to generate feedbacks from the GC algorithm to the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification. + +# Basic Idea + +![](images/036-bottom-most-gc-compaction/01-basic-idea.svg) + +The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon. In this process, + +- All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbages. +- We produce images for all keys involved in the compaction process at the GC horizon. + +Therefore, it can precisely collect all garbages below the horizon, and reduce the space amplification, i.e., in the staircase pattern (test_gc_feedback). + +![The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.](images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png) + +The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line. + +# Branches + +With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above. + +![](images/036-bottom-most-gc-compaction/03-retain-lsn.svg) + +## Single Timeline w/ Snapshots: handle `retain_lsn` + +First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”). + +The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain. Given that we have a single key’s history as below: + +``` +LSN 0x10 -> A +LSN 0x20 -> append B +retain_lsn: 0x20 +LSN 0x30 -> append C +LSN 0x40 -> append D +retain_lsn: 0x40 +LSN 0x50 -> append E +GC horizon: 0x50 +LSN 0x60 -> append F +``` + +The algorithm will produce: + +``` +LSN 0x20 -> AB +(drop all history below the earliest retain_lsn) +LSN 0x40 -> ABCD +(assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here) +LSN 0x50 -> append E +(replay one delta is cheap) +LSN 0x60 -> append F +(keep everything as-is above the GC horizon) +``` + +![](images/036-bottom-most-gc-compaction/05-btmgc-parent.svg) + +What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped. + +In the example above, the `$threshold` is 2. + +## Child Branches with data: pull + partial images + +In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. 
But branches can have data on their own, and that data can fall out of the branch’s PITR window. So, this section explains how we deal with that. + +We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable. + +``` +branch_lsn: 0x20 +LSN 0x30 -> append P +LSN 0x40 -> append Q +LSN 0x50 -> append R +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criteria, and if it decides to create an image, the base image will be retrieved from the ancestor branch. + +``` +branch_lsn: 0x20 +LSN 0x50 -> ABPQR +(we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply append PQ to the page; we replace the record at 0x40 with an image and drop the delta) +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +![](images/036-bottom-most-gc-compaction/06-btmgc-child.svg) + +Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch. + +# Result + +Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each of the retain_lsn to ensure branch reads are fast. As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before. + +Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range + +After: sum(min(logs for each key, image for each key)) + +# Compaction Trigger + +The bottom-most compaction can be automatically triggered. The goal of the trigger is that it should ensure a constant factor for write amplification. Say that the user write 1GB of WAL into the system, we should write 1GB x C data to S3. The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic to the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)). + +We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification. + +Given that we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon. 
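The next few paragraphs compare the legacy loop and the bottom-most pass in terms of these quantities; as a compact summary of the relations they derive (nothing beyond what is stated below):

$$
\text{legacy loop: extra space kept} \approx B, \qquad
\text{bottom-most pass: extra writes} = A + C
$$

$$
\text{read amplification below the horizon} = \frac{A + B + C}{C} = 1 + \frac{A + B}{C}, \qquad
\text{trigger: } \frac{A + B}{C} \ge 1
$$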
+ +The legacy GC + compaction loop will always keep ***A*** unchanged, reduce ***B and C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space. + +![](images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg) + +The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon. Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space. + +![](images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg) + +Also considering read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon. + +The metrics-based trigger will wait until a point that space amplification is not that large and write amplification is not that large before the compaction gets triggered. The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**. + +To reason about this trigger, consider the two cases: + +**Data Ingestion** + +User keeps ingesting data into the database, which indicates that WAL size roughly equals to the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals to the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=1 for the X amount of data written. + +![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg) + +**Updates/Deletion** + +In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification. + +![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg) + +Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user write). The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size. + +The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor. + +20GB layers → +20GB layers → delete 20GB, need 40GB temporary space + +# Sub-Compactions + +The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs. + +![](images/036-bottom-most-gc-compaction/13-job-split.svg) + +As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. In such case that we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5). + +Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. 
+
+To reason about this trigger, consider the two cases:
+
+**Data Ingestion**
+
+The user keeps ingesting data into the database, which means the WAL size roughly equals the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals the current bottom-most image size (=X). Therefore, it is triggered whenever the database size doubles, which is a reasonable amount of work. The compaction then writes roughly 2X bytes for the X amount of data written, i.e., roughly 2x write amplification.
+
+![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg)
+
+**Updates/Deletion**
+
+In this case, the WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of writes. This incurs exactly 2x write amplification (counting the write of D), or 1.5x if we count from the start of the process, and no space amplification.
+
+![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg)
+
+Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user writes). The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size.
+
+The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, add top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor.
+
+(Note on temporary space: rewriting 20GB of layers first produces 20GB of new layers before the old 20GB can be deleted, so we need 40GB of temporary space.)
+
+# Sub-Compactions
+
+The gc-compaction algorithm may take a long time, so we need to split the job into multiple sub-compaction jobs.
+
+![](images/036-bottom-most-gc-compaction/13-job-split.svg)
+
+As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. If we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5).
+
+Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. Layers that are not fully contained within the sub-compaction job rectangles will be rewritten to contain only the keys outside of the compaction key range.
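+
+The vertical split can be driven by a simple size heuristic, similar in spirit to what `gc_compaction_split_jobs` does (described in the next section). Below is a minimal Rust sketch; the `LayerDesc` and `SubJob` types, the flat integer key space, and the 4GB target are simplifications for the example.
+
+```rust
+// Sketch only: split one full-keyspace job into vertical sub-jobs so that
+// each sub-job reads roughly `target_bytes` of layer files.
+type Key = u64;
+
+struct LayerDesc {
+    key_start: Key,
+    key_end: Key,
+    size_bytes: u64,
+}
+
+struct SubJob {
+    key_start: Key,
+    key_end: Key,
+}
+
+fn split_job(mut layers: Vec<LayerDesc>, key_end: Key, target_bytes: u64) -> Vec<SubJob> {
+    // Walk the layer map in key order and cut a new sub-job whenever the
+    // accumulated layer size exceeds the target.
+    layers.sort_by_key(|l| l.key_start);
+    let mut jobs = Vec::new();
+    let mut start = 0;
+    let mut acc = 0u64;
+    for layer in &layers {
+        acc += layer.size_bytes;
+        if acc >= target_bytes {
+            jobs.push(SubJob { key_start: start, key_end: layer.key_end });
+            start = layer.key_end;
+            acc = 0;
+        }
+    }
+    if start < key_end {
+        jobs.push(SubJob { key_start: start, key_end });
+    }
+    jobs
+}
+
+fn main() {
+    // Ten 1GB layers over the keyspace [0, 100): with a 4GB target we get
+    // three sub-jobs covering 4GB, 4GB, and 2GB of layer files.
+    let layers = (0..10)
+        .map(|i| LayerDesc { key_start: i * 10, key_end: (i + 1) * 10, size_bytes: 1 << 30 })
+        .collect();
+    let jobs = split_job(layers, 100, 4 << 30);
+    for j in &jobs {
+        println!("sub-job: [{}, {})", j.key_start, j.key_end);
+    }
+}
+```
+
+The real `gc_compaction_split_jobs` additionally has to respect layer-map boundaries and rewrite layers that straddle a sub-job rectangle, as described above.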
+
+# Implementation
+
+The main implementation of gc-compaction is in `compaction.rs`.
+
+* `compact_with_gc`: The main loop of gc-compaction. It takes a rectangular range of the layer map and compacts that specific range. It selects the layers intersecting with the rectangle, downloads them, creates the k-merge iterator to read those layers in key-lsn order, and decides which key records to keep or to replace with a reconstructed page. This process is the basic unit of a gc-compaction and is not interruptible; if it gets preempted by L0 compaction, it has to be restarted from scratch. For layers that overlap with the rectangle but are not fully inside it, the main loop will also rewrite them so that the new layer (or two layers, if both the left and right ends are outside of the rectangle) has the same LSN range as the original one but contains only the keys outside of the compaction range.
+* `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files.
+* `generate_key_retention` and `KeyHistoryRetention`: Implements the algorithm described in the "basic idea" and "branch" chapters of this RFC. It takes the history of a key as a vector (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. We also implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible.
+* `GcCompactionQueue`: The automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It then calls `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction job is added to the compaction queue, and the queue is slowly drained once there are no other compaction jobs running; gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see the limitations below for reasons why a compaction job would fail), it will _not_ be retried.
+* Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction.
+* Changes to the read path: when gc-compaction updates the layer map, all reads need to wait. See `gc_compaction_layer_update_lock` and the comments in the code path for more information.
+
+Gc-compaction can also be scheduled over the HTTP API. Example:
+
+```
+curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }'
+```
+
+The `dry_run` mode can be specified in the query string so that the compaction goes through all layers to estimate how much space can be saved, without writing the compaction result into the layer map.
+
+The auto-trigger is controlled by the tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but the manual trigger still works).
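+
+For tests or tooling, the same request can be issued from Rust. Below is a minimal sketch using `reqwest` and `serde_json` (assumed dependencies), mirroring the curl example above; the tenant and timeline IDs are placeholders.
+
+```rust
+// Sketch only: programmatic equivalent of the curl command above.
+// Assumes reqwest (with the "blocking" and "json" features) and serde_json.
+use serde_json::json;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Hypothetical IDs; substitute real tenant/timeline IDs.
+    let (tenant_id, timeline_id) = ("<tenant_id>", "<timeline_id>");
+    let url = format!(
+        "http://localhost:9898/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true"
+    );
+    let body = json!({
+        "scheduled": true,
+        "compact_key_range": {
+            "start": "000000067F0000A0000002A1CF0100000000",
+            "end": "000000067F0000A0000002A1D70100000000"
+        }
+    });
+    let resp = reqwest::blocking::Client::new().put(url).json(&body).send()?;
+    println!("status: {}", resp.status());
+    Ok(())
+}
+```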
+
+# Next Steps
+
+There are still some limitations of gc-compaction itself that need to be resolved and tested:
+
+- gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging.
+- gc-compaction will skip aux key regions because of a possible conflict with the assumptions around aux file tombstones.
+- gc-compaction does not consider keyspaces at retain_lsns and only looks at keys in the layers. This also causes us to give up some sub-compaction jobs, because a key might have only part of its history available due to traditional GC having removed the rest.
+- We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time and blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to split both vertically and horizontally. Also, we need to move the layer download process out of the compaction loop so that we don't block other compaction jobs for too long.
+- The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon. Currently we do not schedule compaction jobs that only select layers in the middle. Allowing this could potentially reduce the number of layers read/written throughout the process.
+- gc-compaction will give up if there are too many layers to rewrite or if there is not enough disk space for the compaction.
+- gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer.
+- We limit the maximum size of deltas for a single key to 512MB. Above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to operate on a stream instead of requiring all of the key history to be collected first.
+
+In the future:
+
+- Top-most compaction: ensure we always have image coverage for the latest data (or near the latest data), so that reads are fast at the latest LSN.
+- Tiered compaction on deltas: ensure reads from any LSN are fast.
+- Per-timeline compaction → tenant-wide compaction?

diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg
new file mode 100644
[SVG diagram: deltas and images above/below the GC horizon; deltas and the image below the GC horizon get garbage-collected after WAL replay; deltas above the horizon are reshuffled]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg
new file mode 100644
[SVG diagram: GC horizon with retain_lsn 1-4, a dependent branch with its own branch GC horizon, and partial image coverage]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg
new file mode 100644
[SVG diagram: key history A@0x10 through Append F@0x60 on the parent branch before and after bottom-most compaction, with images AB@0x20 and ABCD@0x40]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg
new file mode 100644
[SVG diagram: key history Append P/Q/R/S on the child branch; the ancestor image AB@0x20 is combined with the child deltas into ABPQR@0x50]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg
new file mode 100644
[SVG diagram: layer sizes A, B, C around the GC horizon for the write-amplification analysis of bottom-most compaction]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg
new file mode 100644
[SVG diagram: layer map at LSNs 0x20-0x70 before and after the optimization]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg
new file mode 100644
[SVG diagram: layer sizes A, B, C around the GC horizon for the analysis of the legacy GC + compaction loop]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg
new file mode 100644
[SVG diagram: data-ingestion case: 1/5 X delta chunks and an X-sized bottom-most image are compacted into a 2X image]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg
new file mode 100644
[SVG diagram: updates/deletion case: 1/5 D delta chunks and a D-sized bottom-most image are compacted back into a D-sized image]
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png
new file mode 100644
[GIT binary patch literal 145516 bytes omitted]
zp$L9a^<&VkP>}7ArF>5Ws|T1-==J2e;^#H~CYg6-+>|jTo4g9q*x~&PDA@vMatu70 zwc)iUy@QQVG!!;3)9)ZB8jX?pJk3$1xEc@m0PiWa3%Dcdn4GN5UKUYDiY?$n>D^ zPiG+1wLX!c7H3#noN6St66N19dzp>`@|JjYZ`JnA0$Z3Bw(q0R^Iq@eAqVWvLgUO` zpge&kfeO<&Dc?L@mJJU?Lk$@iRBXJNDGq>zS`!ci!l>e08bY4~*&o%g`@>76GVBh* zho~@2&$5)&h1A$2y@V49#2)VdVyC;lk|++U)l~*|4ZF3TP$Io5l4I}$QJ?$h$i#-4 zif@aNGh`QL{>zPvA-mS9r}h8U8r&5P151@`xOO$K(g$0EFYD_Xad2g1_X zOU-0BS@k{O@WGGyS5l6l9&djZ(g@OxO6VP40DMCHPC&Itwj%Y8`Uo*8Yw?a~FS$v@v?fqDmgvZPn}FW}jO+2jnoB~V&hgA05QyN7 zX9HL_arN>eb70NB_1&H_lO0Q;B1)dWf}{r_VI*;(Q;G8nv=&I$z3p@9=fircb#rx< z5L4?l{m8b8c>KARrG|`#*;L-i`;%cwK9H0cr58_!@h__C`r}X5lse3SzL?m}9}s?= z$~AP5qQ%y|OB&r?F!5q!79ft*mqi5A$Hb*Mln#N8Gi*Y8;lVtQzL-xIxwF%d%V z4jJzBBah&VTJ7BBX6w>0-H)c2`-f<3UFvd`a|yDoa@!mq$|f* zFR?=_-XFyZ?jNlPz|JAvyentn9aoftfbknR@pggc^5qr;(o6!z^2Z@*3Z@-hpmA_tDQyLcU_V|4Zk8|IvNFfgP4R1)RC%oUcU=D0`OQ> z^SJ)=17dZ_fR|R|aPhrIQfn=s-Ic&?M$FcV;WO7U-ro6V6lQ&F1g>$136y7Oy z3)%~a+SpkhJW2J`T8R<#BPZ!hZY`jtG+uH%3{AE#%x6#H6hwO*cCBYbroMixh(71) ze;9HjZ@1^~t;{ntxFHJ4Fw}QeV?;;Gb5P^w0Epm@AUt9VCk|4L-r}V%$J}bQ8)=f~#l_WCuy~T6p_W?39 z1EMpD`rIGI%qqO(=7Pz9D$cb8Ffh(jk~8rR$Gq0wH^kM9AQJMALEbs@W{{W{7%JUO zKhaj1TX{9c`E757H*LwJpz-b44ZGe$uef}!?R*nNF1pudvgd@51~5mk*EXNCiH9yxdwxqc1GS&@3S339g1>fIUHt=xbka^h{i(bgNTQ`Bf-P1#J#N1eo-zFAJFVhx@Th_~| z=l8e*zS40V1WP(Tf?Mq6sZS&w&R%C{+)>YQ0`$eS^G3GRc1{7;}x z%{KG`I(mK127S9=DU&+Rh-78Yf`@h>QP14~_eh2ys)T)Y)hYNCV6xN~^7r2f11WVg zr2=RuRnPe(kFlel*4*8cx&u`1gO3}(zEfER3DyfsN6K=q1-QrgNMtP5C1{q~EuAEZ zO`=jtH}VnI=pMC>*@UQ+dJtu@fW1pd`S09ch=(7in|vwu`5-tlXMG1>|PVOkca|W>_GQVEUhx;5j zDsm`JMgM5pvmFi9UF+f(J|ti+f&J9%shbf8M4^tWo#T~HjGD!SLPaMOh<`70TfHDQ&n?cWafTr?ngw1{KCMKZElGK{zDmgMn zIylMS_m~)_udDjfSiN#jytl$ApeeW)M@_`;7MBA6)YW zsTj`#bl=Y|bz@Mdw8Xm&m=O5t2W``X3+X11Lc9Kexc%OE#b^VpPd!)3_Z{LYk~`HC zO2?+nI|CKhp+xCTM!w#e{y+m?*0HYxax%7ZB^*yPjp?+F#m}#}Ap+%3qK=c>2`2m2 zcOg2Yu9P}R63kG;ploZ!%am)s7gMddlJ7yLbSAlI{@V3Bhs+6Tkn|ktBhfJ*nS)Nz z#F&tbgX{>%p5H&8b)ua^q9|q@I-WI=BFER+rallg-K-psbI=UqKr}7G;e%@d!}Pe+ z;WIP18EoY8B3-6GLSz%Z1BvXc1d&Go-s>>9MnFMmro;#w7r$5?)uBuF9b`=l$gbT( z_A-3$hOm>K1l-Aoh#Djuk~Vck;G?_OK0xil{4ne8Rx$^S*{M4@QnN*_fV62b&#!ud2en+UKIL^DaO= zecF#XBKqzT!S_eHK6bv?-MHWKRWogsZ{Fx>t37JD@Qc}Nx~#9H?q-9-z>MzGV9 zrU)UU*Hm?P6ut!_C|w4+n+`N5LL61+$0HYVzU;&9jM6uFpJ=PLs+M^DL_(>3o8-X7 z0W~`1`W&nCe-k04oteR_5^HQiiP8UkbyWCz^161h(-U z8TLDXwUv$PY{mtqQ#&Uo?*FX#ugO|?E%)jo{D|mX>~H4pO`W2Io<($6p+RcMgS(~* zVW$xtm4pvzVEyPIAvx*zQPhNA4CNK2#N)o0qYJE>C^ z6aX!;q|1`mQEZ!S=n8$PhMU%HM^88GYKOQs7fNN~_bN^!#ACkS`Cvi7Y2VnQYz6EK z-};LP>m)tqVb4q{KvGMs4M~erE2cg zLk%sXK3Z)Z^{$;_qPUZ_y_k6s5V)8)lDV57mrHyfdUCa$0N7hKZ8!&Uz!A#c;yvFg z8ed(~Jff;9Wc9&{!mGOGOKNS^n_zUWY+PGuA^%^z^Y;ZY590^Fa)jg8(WYI$W=;c?R<}15n~#tU`oVAIrooB6dJA#Cih8Wq+e#cTF2>+aL(d1!P@ zv!1AunMpchSFb%}8Z{sd!MFG17Y8bPg0>NdF9@VZ1|g4RUHdu&gnaz*YAP8-Zhu9u@Pv7y50zp~|3LPK=E&ETY#?74*>kE* zHVeQ_SVT>gpt=RgFLb?Le+S5WZFoiddfc7U%A^0y+091D&hQpOYttMq9JQ4}yI2CJ ztGnrI_}B;}TxhUD^KIsmag|*VV6|iJCDIM%7a6JBA~rdCv46D+LPg);c zXtElQD;wE}oA1Nn|NHquJ!$Ze_M~Fj+Ycl7t-p-6zlhF8K^`r-mdOw?3n$lgv=xT1 zMIbEu!<-sMpM~3GUz&3%>ns8B6M21T|0Q&iaR+PYc_7035D90sA0R^P#q-lE^v}y| zq|}edxtm&i7Zc7Z@YlJ#Tj5bT6ISyctrbarHz2t%#ZVS#AqoGWi_4^gyXmkBIGk{) z8*dSy?)vA;mY=B7yF+A&iVP$sDP(1NG@dM*h7j$AV|kPF&DA8|tHD=63QCVLC9E8z zio>ixeKaXI-y>@N`d2;E+|Fv;8!RRW#vNK!=SpMf$(*Jo@ON|8w9-Cz)1t_toy8fC zi&ih5zxY%pqA*k2r(LcWlyfq?Dll_ztcW+iKN(K5?qI_pRpG+Dz|J9M6}1RnJNTlD#)V;)ngd%bjm_ZBeFTo^96VGBhW9x zd$m6JZ3A9O2T^F)=3+H>Q@QN*!QmLd=Xd_k!|;FHZ+QN^x%mpngqQ#0EPNfWUYRxR zk%+5y!jV)?+}=~4gz}FXT>BZ@p{p!|%ja`=bK&*uFvlNvkex74m+*)(dDCaZfm4*a z4P3`l9Ni~>oO*F5}hxUlIpz)VNQG`OwT 
zlt)3)R)*bPZz)s>Di@zHj0t&jQ)5nmJY00(G5LDoLpwL>Pu?Iz!i#Ze8E=WhrKCi*cZVr}lh5A)i657)un;1J1fa24@uihpP;WVJNik34~i!r{x7Wi)RKI z44)tb*U*t=#XPWs%2BOfs!45(lmyDLf?jiTwk_fekT&7TESYLSwnjPM@GdK_;bwtl3?+XpZ){|30!dEW-dPAT8qj-*HCg+jhYKD# zu9<+qa524y;3dx>(VRZHm1oiB9WjW)B=Qp2eEraPO{b6JUU(U~;mEM^ zD`b!&|GH`FncOm@0M7euGGL`z;@pmxjof!PT}=>bUMwqYFpfxymk>Ger{(W6AhM|a z{GJ+KYZf-X0JhuQ1fEv;fcDId2k+A_vYEv;s$u&s*C~&*EWI@? zT$~|soM#T{=}PUTA%D23iC+$z{(7$06_omZk(N>8D{J>)0sgq|_m>@7vQ%qH4j*y| z;5S-PJMR{M7HmcIxv`$mt9fYxAD04SRX;5E@EiB0R79^#U#JE3%lp8~<5bvpz~vyo zHZC4Cs!VNycIlr5GaK1pI3*nbC-vpWNfVN@56&J=(2!=7vPR!I?ISC^Y|1r>2{^joMrbEF0B6=CD%hsJTnvv0*1M^ z6+oT)@fQKt5B~9wHnt<+`gX9O!~}<%wzFE%DH;-wTumVUrB4gg!SbFFcU$NzB*Xnj6iQ zx`VrFC6pC^sRR1Fp043G=(Pcp`Z(&rZ$gxWVxpBO8OU7ejk!lK;_xE{9%#FvBszu4 zGk4Qp>~~mcEYB?kk!{Ee zv6@-_LyLj>SDFOeC8Fr|4$ncnxUWe0V9$Nc7`Il9jj? zb_m{$XM;uUrAh$J^W0;9C2*MOR_)OCxUh5JzLAna-FF|aKz8DjP0QGJz9BQ1E{kOf zRHeXNE_fd@A9zZ{gv=b9Va|bOCm4;dhsM1Yf1dB315Tml1bL$3c~0@b9w~ne39fte zm|tIY5S)Vc?r=JkUUVT8Go}*oXcG@)W!91GBYpxI&WCmAT__FkeMRUFd@8hbyC}y} z?~93INYMJ=1Igu$z||zItz!y9)cb%vCs+^2%rA3|aK*y?xQIF)kSF>7m+7o({?)R< z3qm(0MUucAw6CLySZO2qVL{kKx~IH{Z0hbn)6`- zR!%*vH&}#HuSXs{P3W-eC*~)zi->YObvtut$yt&q1rp~w*nM@@4JO}>vLJV9PS%%_ zed$F%(U(sn zUmR+D@S@Mu9#5Ix=I0(SHc~Eh4qPO9f1hCV_I=2`f#25sm$+3>J^4P~&zbl-6X)9S z_`)A?BB!996#v&xm%8EYA81TdP?MSJ>Tq5H8JDyFNDVwOCW<8FfDl|6`%b*NSku^l zJQ%IbP1HcT>XTVyE6C`k;&sdOTbssq9O#L`pywUF%wguFaj z`7z31_U;{6(X9?mu%}1Lf5!(eDITH zm(FNlgy_oRu8K~->39DWhKRYt6zgJ&J?@Z}P}Q_ZqnWcqr6N|QxbV{>(muVv zh!ck36+MhtB#JzQ{A3RqLcqz5TK+$_vDXI?N84(LZ;?pjpB+~! z6)3%xF5PyQQ&|5@fG^dy2Daxs4h=#i^WBe`R}s-6iUI=-0U0agNZ4+mo;%Z zb21Y^3yDNTn4giDoLAm@`D+q%z)ue`UT^mKM*JnDCqDUr<)-2cL~|W}U9-&4Wh&Mf ze)eC~0BkF`@_0}+3GL>V-h`#$X8koW-YuZxf0_NLegS354kUgxKYqOF*KKl&yel;l zE;>3=5gdLpSlQMQvEoo%bysdgj1;2yySSZGE%7Xkbu&g7soenX;qEbqi{tmIJJd<2 zWUKsRmo>gyZ%evaH`Y+9Qm2+Y7dMJG?mj1YOR8W5vgv(-36F7Adna@K1{iAw)D+}4 zjA)m^)ZRZ!5~p}3c&<<*_9Xsy528YH#Tm$zPk3bBxccs(TGv(CWs`hwqk=bal0T7A?9&?^P`VH_gc&)hnX z90>Mh$nE$q3*f2UNI`NO zW>%>0{m0aPj9^}P^hix|`!D6p5DhZRNkGq7P4v4u3N853x`=M&v?2T_k+r!troC(8 z6XdjaRSF?+6N*P)sa7m|$Vl|f?!So64il_S$ZPC>SPyyJLo|^f!{pYE{{UJ91x}89 zk2b<5M$JsN6}sA1kPeCuQ#kfLAzsI?rNk$-go2&!RF>KH z1XW!k}+TN3F)Ej33)QVLU6#s^Hn=Vt#G^g8el^xBnk;+HWuc`zoem+6iB zx8lKduRSC}%d}$e?F_`nY2$wDQri+jd9&4+xPrCF#s*m;bz_69i0z{opP*5Ul3x&d z@$x&X@2Qbk`;$;{ty{NMvgrc8UJPuZpdE42eH6o0Lxle%w%i_li>%^{k)Cz@Z6`{?*vB##E1IkyL?TDj+<2e069HFm!8+v`~?xlqX{5 zf}nkDT4}4Ol>ny@`37@9?;#pWN3puSOR+m0_>eb|BRkMfRJS`8VN4&s$_e7Io z1RU5DoV=ip!FRxf%-g9d*Oml~q2;KRQYbmI$bQ%WH+@9<5HaaOsl4ZMoq&S@Y&;vw zLPS@hYe0v(IQfTmd2$-~6JeJPu~mlIf4hX;+zk3IX&{F~1#rp9QuaGilKj2B$svKL z`m)a@(DU-#JIm@WDMEZLyb>PmerHz_Ws0tVk^D-IAPxUiuxSs2f_gp>*mMb9rN5pV z_b(>AOiV0xFIZ@+ScR420~J8Hy!@Y`MsDFscXh04FbOy(#AcuZ>BNCMO7?7^f*nl$ zI!jy7ocX|}lGpKqa~CLz%OAmR|7yAUP(Z+k9pM@<#r(3!7Ck%J@Xn18iT#y=ps^c8q)sSW=8Jq6+rPC3s*`0+LqUZ`ErqyA+Q zLOFGV28F$GmIJ&Ce72x5il0CAk1p^xq&mKs59)=f zI!NLDkHhnNlN~>wi3b$ZuBXrM4x&cv9O-o~2?86A_lm4HJ;Q9%JRh@YmeS`4fePsl zsCec2dv|fh+jN%$0NWUm_Fc7d{S~(v%QvDaKO(gWof{N^MsKViRrS0It6Bda_TD?J z>1^8@pHURiQD(q~3g}1?r6>qUjSMQ%6;!0Fh)4+tNRtu;M~0E2lmw*2LJ?_FLyrYP zYLreA$^aokC{jX6NO;#5&zyVSQ|{sSd!P5Y&wbwej}OTAyT5y{wfEYqewK?wO}Z?& zc%TBDkq4b#8sf}sM#}-H6{-$EmH1ocLHR`ZUsQ7lH&G!wdX*-t=t7W6Rq+XKw-nPLqBcYk@QG7qB$I-NcO%bhoQs~=qX+7S5cwp2oT*>(4Nx% zXWBWWD*9@784@Zu3-BLS1hbW2H#C}d}xET~$-1NCgZ%MZE33VkG)r)6x_!sDMJV@GJP4m^dZGb@w1+!l@!;FA#(MWcbYLg@d?)+7JTInNOc1pj7Qp`HQ z3ENct4w!R5Trc((xDpUc6rP00F=B;j>@GJHF)089ywz4Y8Ui~4t&!iLUkmq2qDx~U z?prs;ges#?Nb+(MF1r>%)5AV%tYN}UmHndpd8`*~KB4o9`z}l&$THd;-yxUCQ4}OU 
zCd)z2W}&GS8(c6{DFg$+=*g+CO>Q@=Aoz#?hh$|VrhOClbOFS}q!|I2RAK_`(Yn>? z{o`FU&t8YrT&(&9zF2$>Y^)~l=e6WM#pm13>JwK?I%`aQq;fc6#a^ksnIz%El`_f4>6>jhOD# zJt*)hUnfIFgKh;bru;sYbzm<_6UkaGF?hNt;s8J~+hZ?>9!NoOvabrt=`~+7of7Q; z(=3QvfcV@m{b|DgT}=7ldQ!lnTVUi$`DE-oeBmqtpcaq|nRKFBPy)D*x7upoiCb^B zc}_sR)WrY<^ZuqIz-}NMIn4iexo3dSngpE!WFVm47Tm?BYFPw;*cKK~X8E^gKf5km zAc|T3+*?On0lccO#ytfW5G&LD=BwX~N}xm2DL^hy8aROz{=rh?!06gg2VD+vxzfL* z`~Zos5tjZXQFrQ4Ves&J1KcKkNV>Q;BXekIo&FB7>qsnaHFK0V!(?tA;w8F(hGT7s z+M=_*T?=eSdw{La)dAKXn0z$Q^9}mm6%d?|WHPY6R}EzgnihmA?U2Zw!FnpVK5@Mj zd?`Jv=|A9^Lwc1UfM(DsuUGPu4szun(3cfW3lsa@Q7@=Vi_ZY1BO(pLTsh@p3Z{QU=S z{zuv5Uu;>zUu{{l1_&06fE&PC@!3keta&fwI@~KT{MhFQ@W1+K0rmIAt7XS}y>(#P zN`MjbJW&La0C98UJqjl-BB%_zQrY0&r}_muAV#P?1Ullh5!@ve=z4|}?8IrW7Gwqq zK0pH5n^F!Ooiv1QjhZXqo`5W3NTJ_lb?e>$TOJBGz8pINZjS;nL_3k(vcc~t z;`|XN2{~3?iY<@G&%3^ZpVASSsF%K%_Lm>uy#3Jry~?iPfN4}0 z+ZQlvXx3ti@2dMvsu{ecUI)_WknW6apWP3o7eQX-mr6}I5|EES&~yBr>g^;g+Eqxf zo1BBZe@U=|wr%^h0Z{n$RQjJD7l@UIG9S?N_}X6ZezU%BS^ucM_m8`-{!w2cBMsnQ zS-%em<%WXVd1}cT6S(M8b2Ar`IE0oVg>bXf`RDh2nh#7twsSO+z$!c#FZ~64gd2=j zL@~5u8%XCucB&a(z3X}VRy~&u@>8b!?Pl$}JgwsgwEg%v;yaG=KYap9DUXEY4{v_l z|LlZz{pQ%nyC7%`T+2t(2hN3mrpY|IfBL&a*B5&imEZhG)s0JiyTZ4kyxaRmpGTt% z7%(Su^Dku1$z`4}0}RQ_(R+`}uIoGcvBy~@5Uft57}vb~b|&NPuD!7mn|dI$nB&bhCv*gVwG`qXF+Od!!fk>VGSz__cJ z`4-?4WV;>t-{-!Ajs#7mcK!Sh~?6tD%_BE~d zE5}-%mVy?t$7he`LsQ;(HJ@Dn-@c-F)vBkJ`OLu`dj0Fb+sW<0YMDDPG;MAAX?HH= zMD*TUBI};?9o-dptA1h0r|Bw>-qm=@N>r8F_XTOMrg(!!-mZW9+acMz6}9`HUNYI^ zz6J9_^OV(_4Y$m!u9b;h3$Epfu*`mbj^{YmGV?d79YewX&ue!pJ~?&z+7E{g@82^N zyyfsAO5zHqB*l!*oJ&@Y@|lwM(X`&6kT+EpSdQw=U?=r2^{Ooi$_`=k~E)+D88j4!}W2vH#nQM88o-{(;CS&aTPH@Cnbu@J{P3#Am`HAnk zs^^R8&g<+bu560oYv!a6k&W#@SM(Ol5QxKVXCXf1@$Hkn5Fr^M&L&UY&O*Mnz+E6> zGBV#i&dCd6W>sKDL=yaIrS^z{vH}D4S0(woZIxv<(w+0Cyz>0f=J2}A5e!?OD`pZb zXx{2No4uO=Fy7A+ziVPR5{9aU2dt;=bOmtln^=>*ExS7KOM8}=9n+;935r)Ut8Fly zSDZPsdmIUV8~J6X0_bWLr~?;momZzk!g$#K^?~y36@!W@SeBJ7d5jJqkDoXAGxjS6 zMv+|!hflC{HVQM_&imz{37i6*eeN{@)n`=n$!LyX1jmDEoYQf|lRR<6X8QM_{Kv?GMWrYqB(glezG z{fZ2jd}BLas3|=FTSW1v4v8?+a)Q$PcqoOU<|3RgI&O{x%N_eT2kT;nUP07+Tz;i$ zDlm`hnT2M7Eo?$jqZq7e|7Z zH79}XH_0%c*=m?cTz$NWiKU;$283MuHfLrcqf{33hehlv%b^0RAt1%s9Cnng7qe@X zl8$bUKIX04L>J%icH&A{33OMoyR}Jf38g%j+Z6aLsG&5-^dcI*4c{`HR5PDxwnbOu z*v}|%BB^y_(QZlE-)Lpq#5tzVQDu{XSS*DF3e~4@*jTvE>*i>b;cP~vGn_z@=QCIv zo-^){j6t@EUtc?l+37|=_lOA<*|n7b)n@!zgK&DV<3gOG`Evb`T@(@`A~{RiW31G2kV|93d3fh+_JB0g{DL(YsTiglhM(b z2iSlLYc-{oMJ3S-doAQ;bM1zTC}L6(mIckMWVKX%6zGXV0yzu87;| zg+1H{B@eyT=p*U3+YHuhSL*iC};}XBsH)Pkr=oD{D+dNe0HN7EjyE`g>dXz>YeYG@B^k&@ndLT_zst1IR5 zZA)(p0LM0DC{F?nEiuB0NM2CySa^c-s+b8k6rFp@3->{7@aknKl$a948to;v>bSw< zC6R^-XGqc2D_J(oS0x{@{V;gVI;V2jHh)?;a^N>xx$Sl(+Po!%a4 zsO;89jgWGdF4ah1?n6cZkvc_@Yvoi#8WvVbFa2l*SKjX<0K0UYop{U(O)$)#>;htU zlMjUMYek+DHPveZnZn(BNfjB8u*1};F5QtAt{#j5C*P55hL96ppHSo$_s%z#*tNOj zp+PWkVMKG7w=9uxEN#!j+=&ufWcg2~UIV|>l$x2b^tYd)gKXhPuP5}TPtkM};lz>h zC`7)@Amd4q-aixf%tU@Xer({GIA?T^>8t!NWZ;w;Ol3q3_|Z_B(d_e)ke7j1EV>?9 z9ck#zFN}&6g{1y)s@Aa8elR~!#w$EAwfTLu(Ll$x51))OVm=vFW@vxj zcAvbysm4xiIz`pV<1{SMA-KS(va5cAXA4{3RjI6}tL2h}2KnAOOHKa7lESz@pL(9= zT{1T8ZJ6uWIFfjT3=aj?dc7>DnlJw06bw|+yqu!-OR{g?`o~YK+OUR_2bf)amA4PU zWYS`~tFhZoYt&6B&H8WT^BS!3KIG(KXsPt}al5VL*O?S|;aBP|L-+bY{eSC zvmVP!&apBcQoT^tb^S+E9QC36En$lpWDE_7g;G{W9eRl>NC{g=8d5Fm$2LwSEM^N% zE%|J$9+C*7BnGzlOJqKX<`FpQCfVm1Nl`U1wwAwVBH33c8$*F76wYSCv?djF6=TI7 zeRl^4?@8@E&uAgtHlgrU@<=ljJ7G+r)ONl~SoG8}bkA)QGGArVP)THoOK_jppaw(X z$IPzFz$(c@rjmUMKb}3A*=I*C&|O3$zwK7mjW=>i_9ZFFms#iKdnYbwXl26klv3_m zRMi#P$80A{{3OclEt9n9D-Mfo^9U#dGeZfUqVFRV*@eYx1LB_XL|RpRHAio~Q! 
zw?)s~+4%F2>Rf`sOWi*dr^~r2dES%+UuVfaJ8I`=tl`(nZP5jq>+kVIhGM-R#ojx) zu7U6DJdWopP-u78aBxtMfk=DULM2x0%M+_sIm)(Zr$H}6S*J#cyZzguuTe$pER_UL zcs+_`>Q?4<`64^8e~fTb7VDcVk2dj@*eQ5O)FOJjm1N)S9nFjQ@5Vnx2WAOQIoc(h z6S9zh-x49b=hf62BXG*>?3BsaE%tBYxNN|&O~@lVXA1~x2OF7^3)BhPqk|Dt}oHq zGC21$F7n?$7T3xM*Li>qw}nH}eoS^6vj;Aweu3S&jo09ZEx=O4%3RUzS5qX?lkMh` zeS){$hR&dS!fh#T+w_DL%bv^N`W0kz`}bXtB!o(gDzvotRzxZ*jDD((AWzKO>1Enc z#U?yH#F9&|R^){Rw)<*FNOdUnVF6~`j)ibhdOH4@`1FJM;+mW>l}ViOct-8szevU# zm7s-wpDf;WdGNfp{@~UL=cTHYp`azDboiL_Ubn`{n+8%bZ{bgdoOl!T5PC_>eq@p1 zD|al-J&Yb^TQZ$qqiGs+v))-Z_M-{RPt!M=ZZwuRRJG4|FX-aQZ1cTtdgFvz8H&e} zMqUSBWl>HIT7WVt2g9{Ez$1-QHw_%Wgh-D1q;fz2f4HXSmr|_zN11U~seJ{0Nn@d2 zwJ*va8pKk|WfG@d2hm>jDz1!|j9G>0NF^zCQs$kAV$g?E1idQp4B2{n{V*xoK7v;f z#d_WT(r#E;7&Klj3^F|NOFs~mC_5|VwR++mcJc=;5kn0JB%bLsR8 zG4GrQy&@6`lxANwz5p-XK_+mRg)=wNw0o9T6YDg{^r(#`r+$S+3JDSfr-Vqw?uBE~ z_%O+$X>1MJiCP4#rlfx5@=A~k!W@f5M?U&`TNz}Ey6H+^kvf(JN6qJW`h~pd`S=;O ziJhL1o19{LtgrrFQ$1?B%@Bw6*LS7t)x31o9jG{!xgnV_O1sZEye(!_U`FSX#QutW;k%kQxeg zpUy1~Xma-ISIAIsT_&{KH7NGB#6ef{VZ%><{UT5j63QxQ_lIQSCK5DUf*iYgt zIc)y+E%~L-{M;J@(TUg94mG5jt2x>%*xe+yz8#gkE%pY-apFBa`ZO(s!q1h6PYFG> zwK6sLwri+FQQP#GaZyG{n`nT8zyZNzuYqKAddQPdcTP!ecB8-x^*7D~;*8d5;XMJV zLmi^q)7mMR9_>ICE#I9M0R^<=2N zz0l>*&yEnQi?^GQJLbrH`j|dF5C2JE=rSSXdD6qA7_!hyN=uou8rK}}oR?gbXU}HG zIZJfirnH!pNXQk1JhysSCc)q_>%EcNJ8o7nyvI#)g5lhg=RJT^iYbXvYL^|ob%RLO zI#0evzMeeriq_^$sUOZJRY{_&h{GwLU|K1U{dFiUvnfNY>#Z)%=@wl1V=M?ATA%BpC^va}lQ(Pot3S|W?CQ<`C({h*# zN8YjwZg!^T$2LZ{T%H`AIOU%^%d?*Kc7=ydiNy5b`Et1Ty~!4;Ht?5qVWU@y7ZNH_ zf=Q#>VaZnX(JKofD}4JFcuyyMa^k(*b3@|gZ00@hbFp{5Rq+#n683FEEkVVC6J;%} zZzm;-ByRP2FVAwtq)XOkXE$DXPPF9{%a}D^A1;LxU*9)w#@-sLLgIVCloX8)CnHZ9 z++<1?wRI|d?%ejfz**yyWmO~^N9YUiRqK3otJ)O1Q7bTY*29coKtot`W1B(}LNkz~MCHi{0!qzWKX&=P=8)@&PWLi`6YO*s~ybN5Q?*=mwpW zaNL>K*Btlug2a6b+>_su@7Y&~?S2%7g$IY4uAfZ4l0#E)?>4&O5Exy=#LLDYiLKl1T_m zHuxm*w$$irx)nBi%DBj(HGAbi7}Z%Jp=;92ux3*7Hmy35)yAKxFL6O$2+l!EjTig- zh*;w?CC3Ad89o9xh-RL0hfFtU80k0fpI7uy_rt6RKF`cReNN$w8ozWP7TQj*=4d9y z>?BQ3gKl4vWv;k>9*u(>X>gX}&G8(8P>vu9lGE zj~SFxxu1GKnYHyPk7mA(=BxzY&wMuN zqNcdg41Aj=eb$1dt$V|1iaAP1U1`S$WU((267af{&dG(coRV>~z0M!K@If}H#RTq^ zd6N2^JU)O9@)lu$X&atZ!1Q;1*eE4h#1G@dSTBGn2zZnn}^ zDil>jSE0$f*2BoX>JIWPj%*>R#OY6wwXxCPwaY4|w}YrDOPqEb&N;9!M@)2czyOn)r~{mF$D=M^$-^E199r5FqF-rw5&s zVtC;7$I~Yq>;oeli_tlv6}n~8Va3JuJdUDaz0%_8v1}?_?wNLF*Yxrkb+H1vYiv!d1x{U5u)>jI-e52N z*!h((BQ+V(6O>9~<4wpeBp`M=@WFWQB;w`U2sV~fk zfYEZ9R5si(;e`Sfqjg2CUHaKs!p>spvE7CgQnUB$_?NoanQ(JG11VCrdIjRN=yUKe z7`3}jBv;Ru!iBue?-D^Dux@0AQgyLN{`jFWVs@$oVxK5(5prYrp#OGRz-Zw$uO@vpGyA3;mtksyrrbj{E*OVs$mFyL2 z8{(f&Fzls^+T%@XFSq?BCGOo8B_-}xvO4f^yhmPRP%))NI0nZWVqNRDIyRyHVtk z{B+O&1s=kV6#bNWENxyT1`Xq>VX)ojVL~S)?FA3Qrh4qP9SAA_PEN{L_UkOhk6zo=B&ZmMT1&y>>vhOvNaoTT>tmgMWU&IXx*on76AsPqeV zXg^!)RuR^*w4%f!e9?>Y)W9@ukU)mlm?HjU;t`7{4Y|z|_f3 z#;=90(iY{|5pTlI&O-K5W-Xb_npGdKi!I10Ey|Bnw^i^df^Uojg9_(_pH+!`2ES}- z!#H|Mnkcb5f=17-9w#@wl}up<^h!WwiMHusq&WrN+^4m!SOHT#%xkX}P4V=nC#c>c4vti$MU(0eqT6gn8Dfox}oadcT+D5 zyf*x#ENI5jGIHI-7n`;|Y2b{6S?fs(&piuA*WdhJ+CLZPJC$%&_tCShH_}~H?%<$KQA>DKAhciN<6~au;N6+1^|9Rj%J#EQUos_sjAg~_Z{shx{xy+TF9#L! zp1QxFeHI-{+;OBcoP4`;E(*ti)xu>IbYk!E1e)PCLcpoMx3XOU6_&eQTOwXkf zu-$~(jj3lj<~S{DOUybPXTQGcDWOO0tU~@J4F-8HrX(Sdq~yfu@gBP=C z+)jf#?vHYp1U=6%S!Lb=W57yYPO8{l!mRUs`Nv4oFcqedrQ)FC1LL*0M;`35%7N}~ z8%dlnir59(o`&Ki=fhKA9~03Kkv(-iU-ba%?%sq_HQd}6ulxB@o>((-TOOl2pn%If=|AiQn>~A)CAQQC^Yk^7J zZ$e1c#ZnVZFq0R{9B6$V`~|}zxbMw#Pz!IUTR`QQ8kShZ)<@kQps?6sM|wgHxYvJ? 
z?sCsT-J=9O5|6PGdX&cH8F%808Zgz$_+}*bn=_&;mnm-pG`Kvw+cIA<9gaRqt$;^` zgMXm5N@X!KLLPCx_;@I2Mc=)?@f_b|7mKYGIj+5q$kzuWJvYUys{>MOG!r`4#qy*k z%Oa%X1T66*IU#kis{><6??YHFA8&F}e|q z;cHLlilueJjMaKV#SGJ3#*952f-m*BqCP%;>s8X#iRTzt-jp;gXy#e*FzD$o;VNlK z&FP9#FUD3kf{(`#i@9?@4fbzT2r^~636v`whZ@7kXTSD;?YTvWu7tA<|NbU7mR#11mf64hG1B+LMN8PMI-N@@!%A z4;G%rSxkEMPb)llmVR#3?aj^{q1z)O{i4GTKP#gZv}pT0BT<4bJ#=pLTbIdE$GmCK z$_0I9s;Q#0*=ISr;8-ZDWAZK~41X-kgn{t%3#d^zIaf4EMI~JI6_yq@wt4m#sX@KY ziBnd~4m-%FdA;8R(`1TCX)AM>!-;(yPOxE+0u@LrO7W)sF`7E8!0NbH#! z$6&%dS(e4z6RAGB0|1m(o08#a#4l_Cgpoh zO_+genso^?qqju#&Pz&Dpu~g*6G+BBxlV^ulN?J@F~($d;yshR`!>#+%4*e7ao1u3 zTc$Ij;wBbwR0lSb9@LIZRAq!(2j_+cN2y0C9`8{mnn5&YkT|Lf&5PjkoP|I-yN9UG zJEQ(Za&kB@Q-SnY-55>^VL@=$jCzBo6qEi$m*eNU)sRY8v1E{z;1`j3;1{)NR)Le# zjt@GWJFQM9UkRF5APuSGJkjl}S*i%@s2)ow6UB+Nz`)gxE7I3I@rbLPGKA)gIu!A} z@-r4t77TdF%pg9}yu^MMRX#CSNM;pQ&FPS@*fgZ~$8t?sYNN&-Ls4cc?+=YerCH@R zQx!}wB|S!;x>$k%dCZK!bndt_@a%22 zh%>s-$@0u?m2<<*CAMK&{F+G?m`O`c7<;Y`0@bWG+ZM$RiB0WjhosM)_#n&Nj-Wv! zfx{kD#q=1~?09qF3uH(I5nZ*+W6AA!O(T^10eivg#vCCZ+ENOu?NY{1rem1hlFS9r z&@hM6W8Lj0*^23{;Mb|Y@1meJLY4Gf24%~C?3{r!{Mnyba!n7Iu~-Uv;!M)ZW69xW zUB7_7M5h|#_Lw$IOYUry_*je5l^c~DUZljEB<_FgVU8#d)I(kFDb8W^_IRUCrev;Q zlpZQGVl*{MVSAf}lYx}iuzGHKyve*csf1fI4Z7AKjw9~gVwH$CI1-o*8bORpS*K^q z>^}J9+yrPwMaiI6#b25{rhP02u3a*$OwxTQtu|JmO>b|?4Fjn`&inu_)DNK``<_r2Gv0#dp0`@NQ9D2k9BZJx$1|_gU+|0 z(DEHF77l+XRm>MHm8faNQdQx?Vr<10o+)W5+qe0=I+nH%6Ct(rLs9fOYdgKsBZfbN zW;SFIAw^pRg6N4Jp+6@MzZV7?YI;RTF$F_ss>915s>$KE6IZDK*Jk{s+EW~HyNqY? zt)=mupt=az$$%PUmrfm`@HFr1M^wn{@F}*wz?l34qyEaj3{n&Yt45{A-N%kw!pEu; z={4|Veh83?gm*ud1L|4y(fH}*<92%aR0z$oE{E|kaqtcc#i@B#DaRR_izCNhipM9Z_LgmxjD z=g`D~p(>^Hv(^E{33vyoBViR4OK))6- z+8*C|4pgxsM19(K6{ajbkX_gX%QV!seJu}aWlvD>Wfppr2zp%0 zX&|1R#VG#`l(RubZ4qc&Ox5@_JZuK5Hf`Il^7?gEJEOz{JE;C1)(+Y6{xv0BeUuD` zQllGsZAyEar@2b`XPL`q1i!Q2zFqD`{iOWnc zxX8&6dmZ5c`6pivy%>&^SA8vCu8aM=)%W%ImmAh+w5HLWD7}qRFY1HEJ5W#BWtQ?f zQJXJdKkxT(;izywr)v2uztGKj>+#}^mKkQ=t3BaMB<6Ug{AHy&k&t%LwtXv+@cb+6 zWqVs4Eu#FLH*x1BonLc5fA=&Cv;DWxWt|Xqz)hl zcB;&ay!k+<|f|l-|wc59J+P!Ow58c=LKJDE*BZqWh`pt zP&WmwP=~tn@&|0QVzOOB!DY^iH`YRionG7*Ew_a>%r(4>Q7mo#Eam|7K56Bkdgmqf zv$of37qsaMqg!U))afXJ4Hc}yycb5ku%4?og|^;pxcMCJ>wTtUqP@}&es^_j%Oqlj z51-JhJ@Vq-4H$-xb($e+DDT6J3U0QHbcEFx^~gWD%UC85xZU(U`z+3^kCq>cjkCby zj(%Q$>*Y-|OfK>BQ-{W4`3twUw@*f*xG$bM`%JcN48?xFo9(H`Io7P<)t)JGxO|wU zqctJF=kjfe@YZzwTn;q^9{KL|5!^zmVbsQ5_VBF>Md1?r)mF~`tYxNzw!bMP?u0!* z^AXExoA-3l*0;noq;fw$y@7mmStpFAcitm!x5YJchUnH`%KeQa(Xhai6?P3O`U&Uh zqOY=dsqIQ-}NK5W` z>5#Fd5vPoMhR>vTIw}hj+x|IDpEGUDDXR{}w&Mevmvnp7vmRin`V4PVOeChgUgR)2 zu2IR8)HEF>u|to%@JW7;^lf%|4r^9Aldj05_o(gpVJq4r3pkyLi9G>3iRob5rfKSZ z{3pfBvP}A-4qu4<_S;iqqFXoeHg?X&4v~~}1eHQ)1s<8qC<&vAA-J@r;yAl4#=M?5 zcrWkD!W_Rm1{-a1b+)z=6RJ#lsVS!A{ioJWTTW<$z4cfWW|AbOHrBmur*NRhOL(Ds zd`xshiXQooPtojCWZg)N2@E}{KgrO~LDhkcT*VEYS74ms`?0rrWGOyScta%b!$=7) zy%gt6R2Sz^>4lknyDestK2B$FT|AEqow$%~iGxiwDk^~k`f-d{3ijL{q-gEA%Zlb(SNAW7#Pc6)`i$;r1pw)M9$N zk{UmZ*`kl7p+6LP7=%fEfCc8!z&#;V=CDmhF})}!p8Vx~hP7o_2WS@w}?i#OTZ23aT5H@L#_kL z@8Fl|i0>4GOBpU;J3SShuuLRDq*`L+Nx{%Hr$P}^@+E_u2q|%FTs0kT=nl-wKE{Wrw> zlPq`Nc=MfAr^w3wx>mZM987Rt6Mv9Tl<1!v?D2M&GFa;y0wk;TYvRzQvjdW-C0&)# z)R+b+S~1Oq`f`s7N^h<@Gt*dxP&I)zyBT#s|BB-}PXHyFUvB;V@aeTo;tFlU$jC^; z6nAn8+Y9BupOcYpSPf86dl{O{5DVIj)MPB?c~Og6WdOz2*z6WGd%BVK!7X5DFYSSH z<<%Zfk_N%tJAa++^5Cw+TEwj?OFr$HM$nJM2oweqf={wsP_ zNR#)ty}e(~s0Az^DPCcu_1SV~aJ1(SWe-EmAVxPaVNTvB%d%H*uqX8*I_j^i``15Q zaasqc`(C(nvvV1!s-#=*_-LwKWi6My#Gf(DNp~(SQR$jy^H9Y*1IXn>_2obK<&cgD zW%Y)x1s8jitEVGqfRihSg1`=vw`>yqAVc#{EC6J-Gw?pavTJ7(;ju6PJkbo@?wFR% z7w1{e7~1OwH7bdIy~|s48n?_ouOr+c{-J6vO|ExLqbH+%7V;LSEOTt@F=A43=^4r1 
zWbBafFSd~@P%UI#Z*A&hJWz<|xcKC@s4}u;wZ0BMl#`7L7(Q6x=@qh<`qYf|WTg#H zt{n01(_I5IU;nahNurRrlt`57Umj6z%+O?J$~sC{%)AigykvJ4?5h6yD@};xdj=Pa ztw|(uC`6eRR^Ez5^)gM)t!Ic(Nt2;LM~HC&4ha72s#+dJi zBQ~GK^TdB*Ot@kE{YDh<=9)|O#@LW;sy7?GaWjn<&8#~D-f-96gQkID(Xlimb|K%>cmXUc3+ z^r=MXk{b9nS4?F*(>Tz{Z%`25U4=zyEe`G?mJTP@k!kw2ER!-y?_{TVzV$+Rm(0?} z%uw~=12iNzIT$sSu&~mF;h6bBDm%sXS|U;M>LmZlSN;k&i@k@FPUQw9QUEdTfsIY4 z*UF6CiU}mQ>v+}S-F|h(?W!EuK!YY`cl^34`Z{vw*_-*)CmfBB0KVO!Jilh=o#Sus z@rra?PdO^YToy|){N=OKS|t6YQ~Uc%n7`?$KWDck_S}2FO)XoU^*L&i%b5B!o3c<@ z0QEnUmMyYn!_fuw6n|`hr#UL&@Ki-J+SRmgsl~^;{_0X8PL%a2w1ExP4_t{xWRK)> z-2xewTWl;{PIG^_CUCa}3NYVZ(>P}Sjb6km+7pB;7 zR>k{8;%f2&vbvzJO5-GH*ZdZ`vHv81YKUW0J^iu8vA&%W$F<{?@tc`mZ&G&d*0Q)*`X9ei;DN{_*Pk zacKEB!Tn(*h&|E8NoZa+Z{|k3$+Exbpe*`0lRh|@Rs|Go{6qslyzyNOZs3<^p;)^} zmGpgqbDvk`Ql?~>8YeBt)1q1z?dqDlGSiL8ORFiQYc7=81EK@XUe3QKcnCqHtEj11 zynM-KzwRR>%UPcr?t(!;H5=iqVQ|XCrDhH;_cKvE#rmL<+s|vbyDx8DCqPQa?1UZJ z?`xuXRy{E`bz&`7ZBx|9R0wf7>N42$(O+f}U3s#2NV9#8HG?HU z_Q^jU8yV4QJXLc2;99uDgEa>QXYTEP3u3vr4fg10&ODw0L70&QpxdNu27AecG!uCA_c8AF$nygJQkfAC{` zRREuf_aU^T%tW-ViH09`UapN3AYp&<=go@tEeW~XXS9YL_gWKFbHf#Nef7lZ=~Smj zzDiQMlmlxxjq#447`LIS;7GcuXYPSP^jf{wa!uT@K)7a{2S;+R@vDtt4l&V1!fO%3 z2|HnSoVFuyBx_?{T{RaZRfBHRZ0uBr4*l2a*niOAjmKsLS(^4VWZU7DdAF5Ixn4QW zajREqnp~h`88k^IUQzCYvN{|oXugm}lf(ESCBY5&l_flmKk$q8mIiz1>$TK0Gr)@~ z-RJS~08KH^&rnx`bf>d$XDkO^Fz5EKuraiUw#(NN5)&=IvPyR#_fZ3K<d^@4Cfpo&?_ov}=$qED9OTdT110voijYy}o+Wtp@g&-{G3v&RY` zOOF)0e?Gk{?x9?@Z)m+oRW+BHCr2pn$l2+#s{i=(62F|5v-nD$&+dMGRl97d+0AAo z*E5*`28M_Vv|Ru9hTWyOH*!JK59J%~jn0oE_e69cfY)SfV8^@Z=&$Q>t(dr^1PFRa z(O8@_tO}@c+J*w<*SK3=P^a!=?+s{Y8=zA#T|b)S#M!br+}xenVTQ9^VRhLth=?W+ z`fO_l>V{xr$(=oQL>D#UyB#!j0MTuYNx5Kz)L4?8eMX@(pByrgM$r3uCj;D!XRcKw zv;1>EfJsm#rjkvxa{sWS*9*_nENnieh(E3$=AicTXw{lT#-<-HpBmVxT&mUMoL7a| zE`{P!(wKM0`;F!SRzKjePi`MhEG4JU(~h*3y}aqi_@C}Epr8_|?$Yv`FT`61Exs4U z?(JW!b!L5pkw)7b=xZT9Ozg-ysL=CO0hFLd2hG`pnWwxOFRcJV!P&lR*LX%v;Rdlw zYv3X>?JEWAmL3$=X-v8!DztPzC|3wvU=UOC&+ zi|X`7ejj3ZAbdislrFK4R46^|T~aKgdp}vzNj$qBlY6vY-P6;Pwu`uU zx(90I8WCLLHzU6v$vp(%q!$-h(7DDM_Dg|BGQ-M2t1P>8u2p~4AW{O{v><50>tl}` zV3|W|D3omwsCBsW+ggIVc3S3ZFy%^;X})OKV*A)td&WL0CNMmqFp~xbqvxuf)#1uv z+OBvlRS?+~ljLT_iWmKNvCh*zL3E!6KrkqICAm@x7G}l76x4Wbv7w~w=pos1#?0&| z4~j1RR#Q0f*c7{lg^?K(e1IXLp0*Fl`Rs|DdCS*PqU`Rv21L*G zfCfZ3K;9%)4vWi4A=X*2hxs&=_7hhH4OG#~B8qUu{bEUtQQa*K5?T`<4@OCga=(UF z3_shjp*DMQM44!AGhcvP9xoO#g&%qZOh2@hMuGCaQ>HL>m#3`Ji9hap6ejk% zXiY<(wnD30Z#U3-Ku896l{lP^CcC;nw2Un~-wAr)+9_k@M~mxKjuHP}y8kxze&O=c zJH&E(t0{5MX-?dbrZMGbtim2ITs7kXOYYqUTEQS#sxna5XQ0CR@}lBYal0opt(HBM z=vGJu%fG@cx`ALRNu-|J^SB{{~B(DiI(d9v+X5i)Mxxo%Yv;h!auFc>*7#LF(@e= z5x?O9Mq`=DV2_8>)MaY{l6MoSVl9_cXV|JCb@P7=see>9{wr)nXT@pcYCeJ}N{t_O z*{b^nZBza7bmsZ4s+UyIV?`0|QcvB|XhJwkf3PQV$xnGL)~o7Im$Skg01MQwtg|@x z;;O@@PQ+jOgeBsCSoP!;K*v-$Uv!z-S#?X-X=fWe_##~C$^S@}fBYZD*(*5`DYWex z1K;x>;uQ_JwbV#lZp>-vR_mLNoGHLnWq+%;=jd^?2YdAD-7sKdl=L^Ltwka0eD=n| zNSzIyS8D`YIuhFWPLf<*!TGDN#C}LzeCWX%&A>r_`uwcHaG=bNjH|k82c}(g4L1L@ z7V++LuQ$w2^=fAmjsh%VmK+8R?GH`c&LmQquC;Jn+ILR{1A&X|<;*Wx`2jwi^ z8myb|*V*StrsVu;dT823CMm6d{6$S+v1jhM)LIy}c{4~0&FcR{i|W6DVEsQe8vkdB zf3wN|D>Q&BPr@7!M4g6%BGbREs=VKHxYw1#(fSR5hHRc&%^HIqtPeKik=|_%b8x0h z!!cE(H>3#s&D8(ZMVkE-xuAyr+c`7-XJ7sebN?&$<^OM&RO#UFM;dY~CM^)~_yYE1 z1&%XM$wN?oX#iwI2_6a>+S`AbJ1x4BUrE=Pl7e`sw+eJJX(SLi6NpLkNPC-<@txL~ zhR-^p){vpJ=mEYIoLS}Ijx7z`VM&yf|3tA_PoV4>O#{5#@&CsqTNa4&LXuR4J< z%jVqtLeSJg10e0$Oy23Lz$#b%vDI3h1fh}14KyUgIP)n~v&hQ#k6*XgB*yJP0#6MN zD0a_Gq)(;Y+XI|B$fNRWvJ)`{ox@EUOjO#gv+ro&rCH83oO51jLvh+Xkho;!*I(tv ziMJ!TGSlOG^8EzYu&_5yKc0;?7x-wi)L`aaG27&zS!fl=?4T9r1x^I_RxDIaSx2hR 
ze4HuD(8%$h0JGZ9Og7AUVg~-6GfrILiuaFrz1Z9Pa(NcD;)z7=PApE`T-sR_g;^e= z;kl!DvSCyE)DpI|G-x)l-|ChX{7ozk9EhS8U!Uw~l>1s*r;$5ewa|ySXROXGSxHEKPe6E=5L93GfT1?mHlXzl}6Ck)Pt_pUuKCx{EE<2qJ(nB@J8d71>y>)5$)=k znCrrMP{bcFu%1?L9OOZWK!RD8C29|eh|hW7tvBCW2ls>1rd@at3D?OUn1XmVMrCmz z#9{h%7xfs;*s<@AZ9ZSBbEfj60>`l&5`hy4E(e*V2i3YVo-YiPFixH(Pu3O}@wylA zq6ZwJE3v#9U$lDh&6PdPt|b?fmmcJbMR6do?S_-~1^~@i1PN?4#_v-(BuqG*=r+#E zBC@gyrmET4)d1d~t_!T&xvXDjGt>*46$dK$q}o7B6RW!tEnb z7}uhpy;L}H<)vIFw{!iTh+39V@nNghj$XN^X~5fTbFM4SsxW zHu%^&UElHvw43EjhWswe=u_C0=tITelL(i&LyI%>u2(^n@P!;31aqzq%bAYCP~)op z*y^^L4v|m0!Tc5c!KG?k#NQ}rcyg2n-#7!jLKY(+4)wuLzZqOKQWYIpqmc0be z936rH)Qx2xHT5;=tUQ|x5O03!={03WAh))HEJVX0`bMFJL$aOx>DGdN;^kkf??_ll)}_~;a6f5U&JK>dw_`Kb6k z=tlOG%IYM@0&rMVBO=?SI<6J+be4dYRUF72*I0|J-Oe)!={DFHW6d%UAPWiaI?>T@ z$h*L&HcxtX@Ne$@e>tc4*KhtdL}u*UsHAg63F21GwSKp^{}T@V!)6!U4xuA0gTyDg z;D}N1vk>0CmYGLxg5b$d8dKK%a40(8n|p_^n!Cw48A z=zIx#SnC714VuArgeKoj%V*f-g2mqY` zQY5Mg9i=K(I)|p!MyA)z&d!Q=)ghU7IOv~{Muo(ueYU<2Y!mhhC)N}SX};YSv4*wY z{qA1<7Ap|{M^?3JA%~>%k*Z@a>&;#_5Wqnchu>eJ^Vik=XFt3d+tCd%jYcHWdB^9L zp1;Jji{5J`{r~Kpzs~Q?_?D*T=D6lWWfJ$^eB`;ciR9dw6!Oa=S4UBkkY+2hg#OD0f=JDKnPL) zf3^4BVKu*h<8_>9811Cuw9(K|s8fy+QmIH9($Jt%Mx{mGXr~eyTE{9>T8j2jR?;AC zZAC+il8m0$yU&Nm$0xeKzw7!v*K<7=f4#5V_qp%;b-(ug8Vz?3buMoZ&Pz9%y7Z}M zMcg9FpQuTdj#fNiB$bz){7LqMzqTjDg=K^#1$_s?uRqMPSjdBJl@h*--S_3d-m)5x z)_j%9q_9^`y@!4FFCWc+S{@QUI=8JJg_aoKGlV(0Je7TP{z`~yF2sOwkYs?%bjzX0 zNKzF?sB7)4VQqp6@Cp^yNf89*PMV6dHai3}0zbK4`1Mi0ql=jUkgl<>c(b5m1kmo= zj_r{svs7?Sw!o%AcEAs22|RnZ2Zc>U;ZpltV$C`&9blV8*#iM6YkgM?ssSR3D#pf0 z0_zISE8qBMHkFA1M{BsgbL00j|N0|MA0$r71$nErySBm(x0WU$iMbjahFOUZgHzaX zzW!=s{X7(K)Cj}QMq^k!GL&!@UGdH$6-80d$AQP(k>sw0Ujj#B={6Kknq(s2aoo^p z@$;L>9c*QWDY=Zvys%rlZoLLwl-+jW0&@5BUc)3Dvb|B6tv_+kY436eerF0sd~s~B zmi%EMf!PQQgZ2s+{1wl`t#`mt?c;WBMu|30!$p=0U8QlIH{hi6p^E|{C~J)Y6q^=H zS=C*ES);DC0`e%cF2h>fUBg0{(C||=6w0Jf#Yw}-b2{RQ?l9Gz^c6+(r?f5j+P2_{ zzj&MhynYZ*(P8ue##u}Ck#rkV463<$bcwCXQU3@5W(E$fC49F`88>e;!!IMS&DDR} zEBoiQLbM3K^9x-o2>V}_2!Tx#%C4I=&HM>Rz;E;Vb%cSwECTX?nYc^_qB<&6KlP7* z`z2?}SMpT8QW9cBsx=oUof48^!g9dVrrpsXB60vpo6=%=3GRrcAHetWB>mCiyo8XF z3J!z$xMj8Ge9gLFQ~8hAh4BJfD{c_4zBH|(@$VloTLZW$<(r{|LaH!EW&>Qe^&3ki zP9!)H86|xI54&{q=OCaVet*_m%m$!EW=#C0&O=N8d3XQbJ^mMYX}mA@;YZ3X>EyBW z5UCH~!L$CnPhc@tvrOhgRHf&pj)q?Hsxc(yOlgCD!_p9h@%jl~d3XdIoH_Xr#V4@T zii@65%{i2)n0Ujk31we9IPSi=9QXt;K|B%`k$;uks~d9BNJUZe1Z_RwD~nb#N*JSo zFab>DrYCj~&^EwVRr^@FNsAya2%J`I+m%xYk~di!`xd6zg$FM|FO1#sjKLRa%s9KH zciM@T$$GrVn9J3{Uktz8h7jdx@0{5cvQZB z9ATQgBplX9`_=~I8bCe$(Vp}pDBuso%_hx9cEki=RQB%K*(hm7GFEUiJ7#CQ5%vOE zoWasj!o|!+aohOI>&J^L65xOhgyN_&P=s@pZAuX$7~>gT0VQh@08gw9gNM-*pI!pl9GB$X&Xtpf^ zB*}Z|)PiCVrai|oePZr)uL69psWvT#5Q4j`aahca+31^w`zcaNtF04J)Ov7$Jawj+ zS>fmylxKu8MGO>kdxD?hy0%4(Pme)nQ&iFH!FSvU0HG&HNFRi#qYG=lcCM9|Cg~VC7;`if(l~dL-6>Lzx%aP-7*N=JlAwx zDn%I+ksW{#$*#Md!4i;d^RBje*m00PF-x7?v`3)}<*X1wtBw_nH74=9)SKfmrSYYm z(J$7iBWHJj%;X^8obx02Is3I8r4K8L#z~#*6h#6Tu{NFtA+5SC;N}b$4qk2XMc^}g zh(SjK`z1~k%7^&aV~qXKx}^)ktwzq+1jaZZ>q=ndkBS~F=rc*>cYAfV3%{KSrJ=Xv z#++w$#pRB#JvhaG)IEO1z0z!n5dJz@+0xu8a!1Xau7&jyNmg)vasBnZRR)o(iC*j) z;^y`*E*7t*i#;7Xua_J&t4(Yr>2PYRRN%UR%n!$}2c>;5>NDhgn2&iC_x9TE z%Tir?s`L~^77GapNz2G!uc)XfuBq9Xt|6yXFnNiN&hz6rtmbyZ~O>(|9&V`If7 zB^$;+yKh`TP-CdAtu5~Ewtm*x5NlUH>0!Al3q!$`FxJRziI^trKmAM~+^RM?he#l@VIp&3;@>3>SfI ziWtC!;DAI5_Irfb) zS{$royT(oLQF5S$9>F9p%`R`^Rxaowexce5%qdNNPMUck>@$&AIntElA1z37tqoFna}vQOqMm?t=vQuLJWnFI>Ju(Yps?Fs zHdb=dBa@x5wHBnT>}LQ0wLGOWn*q1EKN7OEr>B7*&-mC7uJrZW=z?u%JGn~TNk@96X1F`aObAAp2u`>Z;i>7T9-na~o)77{V z<;iMjnarXy>x)_!oGTS^QW603Mh~b$pY68XkEi~XQs2e$vbov2si|q`{d>KyT@E6Y 
zHe-N`<9;%Vaad)E)z#Id<>m27OG_8Gx0|0cuHnO85sgaP55~RJn~n06u|?lZG5X8+ z>CAg(tlPA;OS-zOGBOthQFbsG@$vDUw9#q~F8l#^mP%IHW--S*hfd+(RUw)9Pj#lt z^H$EminxY847CQheSe^%*c0BmDBB-6;C=4gxwN99CB~m7L_<%XNNv=#IOFpxi2MXK ziZfhejbRcBZNcqhB0wu)23G`s*z)gI(uoB!ajspCz$)=yBr(C|h}7<5oS6?z!ADQL z%E;6-e%G#TXqrJF)9t3WX$1uW!66|f_wL=Zh*+a_f+C33mX$$AbU`IwlwMwVNf zujco;XEr44>2mCt$!0!V_QyuFT?ecSj+Y>4ogG}R9v&>p%F6AZKatIO(6`i*T`6{3PDI`rI?NFWy71aC5rqjAba^ z{+|3G!nx?MeE@{ZCb18Nd05GTo>(J-*7iygI#IqGE8mj{I3GJlDLFtjV~cG zgHFMfaO5wt|Gn);bpQb8uA&~l1<=cJXn8p^0HEnOv69{3GTUWNIL_e%7g40bIE;O# zO412|EdwUGz8G1;ttwC=PkZR?oU7uemNcTkEQ> zzeSp`!8&0otH^A&Y6=^Xv#3q{%KbJb=0$8;I!wM=q-WO!1)1r)s3f9bz*0fWn{pq~Rl%@{SE<(rOsn@%SER7t_+X3lTYI5JYbS?8%Dbni@NY4p~D|rW55h z``LBKO;+s^gfE;r2Qz-;eKvg*>uPf zqWe7%b|(jWDwENZ5EZP-g+Q=IbxqqX2(Y6kq_Wkg@7+8Ofr^~RKbD#Ls0DHtbel{X zXA z$TN?B3gb1*2bTpF#{j9|-ZC}V@Q3Fr8r?@}Wh;vK#l*ylCr0~9pppsK9*DAheaCVG zpVp)?$?MY#)*jT{mq5fA3t4a7iX@^6*B3T&MkCc(@Oq>M*1iG(I~m2mI*= zDLn*XVc~YDh{A{$$I(cJIOR^uqEC_PPZM3kH){GI zy-!l8JXlv*gfC?Gt-?!h6!3RuP6E=rOZh>&{LK+4wFC&sb##Ro+=XD}r^|fFT3f(! z+sD_psbEPbN~{7sP$(o}PBLxSu;Eh8#jm`YUs>x#s~ql>jE}pnhW4!`qjjcB!ljI} z`?O3)MXKC~?#B>8T*qRdG(}9Ta-%0BQ(a#doet z3U+t2FdM;DVu=rR&JPbXpbSfP91Ploz$Rt#PINT%tG0;hL;$*)WKyBbz~Nmi zcvvHK0UVpSS0Knj#;c?>tFrJk`-Th|6AB+czoxmRJp1GIu+_kYQV2@*(j9FO2d#Ve z>qKA4sRgPAVkc6VR^qhGnvW!r+&RXff)zKOns>10T7(ZCetoCJ^CBJQ#&mlCqU*=< zRohXxCp}>vb3SD0w}TvJ-nGZgYBhwn7uOYi9i5$cNCA^$Fv9y+b5%Zh+EV+IL<80k z#d5ppYwwD8ekngjQehShvuDpPHun|a#vh-dc$+w9pV=XT7W4SdXszBn1b#CmnV6WD z#)&Sxcoqf2fG1a7g-bs-F@U0K;}Z2GFmom!YfA(fgLAu;%iwyh79!>N!fryTl)eG^DrTQ?@0uHsxarH<$jKtnJDED z41J*H(7pwXX(6WKh5r^O|NhH69vyOYQIt08b&##e%xX3$++w8=aB9Xp4W`qz5R}~1 zJ)M~Yp0KysMH6K!cJlqhRZ6pC$Ue}-tqNI8KJex%F?a6V{ILU*F_)%{)h8DuN_uOO zuOVpvayVNy+_p=#bxF^i4+eLDq^oHc%Z`)0ls z{H}YsTp^B^9nbY26y4Uk0o0%c>?=?p4Pq{U*4EZmVOkLNBtPfr4cBmuu(99yr@!BY zP9%*+qZkypXT`hLQU(HS=27!dFb~Z976u<5pJJ|EyV+Esa?*$6ksRHkRgI2CUmD*{ zoc2Z17B&ju)@N;saf#j>u`vH2-8;~;GoKy5db~dB5Y#eT7IU=4&Dpf)GcVC7ggoL0 zVGA;QP#z%sOwDqqLXc0lLiq^ug9l~4Vp{c?w99IN#W!~*6+eEwrP5{%)dQ>5>)SD$ z?6Ec;$!OCtv#=1-oXmajw&{*90}J=c)&5CL=S4vq*Kf&2P;@O}X!y>UaP=*~gD#~7 zpu%!vmN%f<&-S<%&OH1+Dl-oNBG=M zQUu_``MT0M1oll`0b`GuyD(uYz>LR8NSPXxWpBGs^7Onb7B)2?*m$LYjcaC1jORzB|A{d|E8a> zQi81Vrs1Hy>_``o$uf7|JZ}ZN+S8d8N)j=y9sZeX`%MT$Tef{FfMEePSyf8se?g2t zst(lo-waVa0(uBSoyB$^|JdccyHgY5sM&^+6m9MeqcA#3J^`wXo0j0E_zpr5KyT;1 z+6T(lOBpSff?zYhu8+W!&kG$anYZdRUWK6@d>qcpB%01qIN_pn@0+1KfnyFt(2{Py zON63t6Ix!3tZ(9hV^1bGBPjGFopjp$`x_T7TEqLNZC^GfEK&%6b4kleg82WPn0OKs7>)36=pvpFC;G9)^=W zIOna700d$;@M}eFnGlZap#3BZZF%$~ivCJ{e~-IGC%YX)WPllWfBMz1AAgEu2$I|K zio9zFoXdaCKF^tNl~M{rMH3xAh{5Gvd~p<|x)?t*e#bmwA6$aR&A*Z+jMG9zwW9FkOsVWFaLw>*kK{aK)P60?0 zHg0BpgPVCQE0>@!SwApTB!l@ib>m7$d*@Ty?f~xA`1SJ{SV4DOt#{T+rVf*G>g=Etr)!k4ihk4(-G`o&xwJ)v(*l0b@ zL(wO}C{rP}K!*xnyg*GmUY2L^pqt=g0D^0V3=;yB0j4_$f!tEeQ2k{|iJ&ikvG9MO z1k+*L84|pld*`is$+}JT3?`3-8+rUe;&U5}$MJyVGRe zSDRuEWopWjk(*;`+hg zwx~$XsY^^(aXD;ZYmMT6uLpkI&4*Fsfsd8n%|pqWr4j&}m{|qZQCZ+ay2|r30v(d( z!`Rh_Wz^hYp)M3kvsE3K zxhUH?K?J@}v<9OkTK$0?u@^VyTkGe|ybe7<0j#_lVxIxw2&`02|LOn=6kK6<9pw)# z-9T(~#iqXffm!_u7Y$*kl|X8j5Ljb@tNE{>=rm73l~K$KIm+|`cbP;aA3=eFEq?ge zdIq93$iL{b4+2r3unZ1=Ei{bx2p%vayHgoqcg%Pgj>tZ{P2ylWA`qTV3=O(3 zOa)~OK}JT>-DW9GDjqu^Hrk9}%95(i(X#6H@Hymkg&(X``S9K;cmX8xuZ17@=;nr! 
z??z(Bpb-3OyyHqcULAe)*YQr_s3!}FPgwUgySK2o+~ z0dx(zQ^LTrJG8{eg%o<8(g$2MwZm}xS>At@GS>;GB-JdpGjICY+&{wn%^n@?tMkRv zWh!2)fWbIQI9_8tny?M!N`NvTEGNLm@R8jb1R}!^ss~;&{el}DD0mMv7z$dvVePO{ zFanpwHRBqf$(@e98xeP7DOF{ZbJ$9`Z=TeTPshKyk(fw^i0cW zESDp0XWIuJV6W|D4N<8hKf`%CT0Q;kQ54FxdE)_5p3wQ}%K8(Ocm{Y8RXvr5a$M>n z4EL{034F-JoewG6whF#V-gHYhOFCVxg+fyR z=Go7TP+WyYH1K8!;QfYtA#UnbQSv;djbOHM>n_}cqC7{&|I-*fcznqi1ofD}Ejnwx zMv5WSg0Tv*Cq8L_8&*QW%>%(UkOSv#8EK=02#Ei1OM4#_RUs0X{JUcmDlZVf^~+`y zx3vx)t;FCLOsevR51xIhC^2@dzA%=5P2^=VEHol_M|S2*B=Nye`^&etSO6*dGg$9N zKz_^w#^yqJ^X{-@7@84WcbE;)w#NuxGr=5^=^S@_|%^%Ds4U|p9#?odO40w)}P`>fS@S&+@anH)%?avL5>)|qPVQQM}gU{Td-3nJ_z@#GGG(Ym45B>eI zWrrDfi+va$_A#Rs`*M-Qfi^nOP8ht?YoL5b~^hi`O{%* z0D-$jMrPhISZG0nqEF3(#j=n1IR+pS35xJ}k{+^^ZzLIGwRm?tm+U4(1YMs(Ql5H4 z!}+rJ>|YjzLcDF0khpKMsAcrlQE$8zbh2S) z_vD@r4D^IA9Pk+T$>yyX!WH7I#S)JlR0|6K?@Riv6rfDPtxx16d;Z`Eu42qmt z>UXaKG|6VRi8-|CZtu;cv5DkWkWadqmfD#7sOAO==Tcw+fB7V%UEQsDP!s-W30^7w z-GN|nirG=X^kxN56mLuDc^cl0`Cz$1-RU`tT}d}`=4p_eVg3t_m1}2~B2+WXCeBTa z|M+-&JG2>JYH+WYI04>gat5gA!oHaw>5)ho+|{QSvp^x%C?24eUdcCT+i zQTqJA1t!_JZ*F)FaEWeuV+2A7&4i&9J&jVB==?^lv<|Fg`skXv#mT~Z^Kp= zLI`Ax%?EC#bb4eV762d1XFKyFK;gd$IS+XL+gt5+C@WnB>JJMjOXCG=(TX!+=L{5r zOM+`K^g{UEV|#J+SCr`piu(H$qOR;)ELAB~b^r%rdR^6NfHp_(fXLtZq%#v9W(n{L zJ5$l-4s)@qT3o(Vw(V&>vZEtO!c@=ts3V>kr{_7HAE_gxuE>_GzSciBK z#@Z>-e~19HPv3TA48hA{NdjKJ&QE^}ND5qsR1#kh4x;g4+0vu4dYQ_0ap2RW7IT~98wLkAH511sN&NLPUost@5@86U>6IZ>btAHKx;FRi z;ek#(IYZEG(OWo#EO&2S5I^|b4ek9Sks9q%<`Jsbat%PWD9^jl*2i{r^Vx6ofPw=7 zT)2}Ie>>Lg|H_^EA4Jlz`sq&@piqj5Il{!n@V5LSNdcfGrwloiK3a6xEh1!ca^o35 zP}8|2fJ-7*sl=DDU%JW&5;8qL@F0mK3Q2V8!Xa9E`=T7jOo!G&6)=fa+o31r!Mo5H zy@YKu|Fb)JCg<-^`fen~Z6}v0?`=ZR+cUW8bDiyQPcoj!3)1GPOQi!3A1^P{904bt zJ4-&SA$V}@%S!y(KWgHL!f{yr48FaF_QdyRncX|-C$6vv%Ljfp@}ydEW-7-n!_a_$ zrVKV9b~R2PT?E!h?1q;j#kDK6Ij(O7t8%&e>uI*?;_G*Vj!xVcFLd!QCG9<)_BQxR zS|kl@&8MqSN_A5*K(dr>vKhJ$%#VFR)i}c0!eHpuZLI@ZV7n;X7(}W^7YFcg?|5ha z#~b~}8~uOH8;vK>9|X6VuNO31HXPi)=6fjy6slL^zB}?nq9pcc3-3VTr-9(Lo$Clh zcW_ttHCHjsOpTE2$p|jRA|_}=X^(d*E{1+t%{1aI5uICDB@*1M}3Bs z#zHW5q14ry3+TtoG0&0jN;m8N^MZP+JKp`bn+3a(`0FpBht} z$98$W+w=}}&H%2VqsJd?ay;~xDoN7Cu4zR0JbU)+AT&?+#@i72JDA%O_On3D?*7*{ z?g$zuuE~C9a=*1qzuWSZMqqINn|_|^4233de>$GTf^sR0PRnFguH@Xbd2x*_ zce-uYfeGn<36yCB(3b6O4L0%f@Q&U+w#~{rZDygj=T8ueDV?z%sq5Qoo`V(cJK6Zroi^OFQTnA$Z|*GGJ(FPgB8v6v_c} zv^`ZdsdfO2TRomA&TJ~~=1br>=yd}CcTLREab|+2%!Z-7r}ue_;HFtnc@+xSs$ovy z0>+gP+g}@$XN-aq%i#?>iS8L2F&94i2<4j*aG2NuV*`cf>^KiFu(kMK@qe2-^s#5WEgJ!;^AMBBIdvb}*)3(0$zPaCKDGd|rH z1Gl%aQ1>!5m~;N>Ta-r)9XLpdPRe=Q+GdyKMalLv69YVm(#zjOJq{leXPSTB$244r zq;n1y&`JvymBIb({FD|1XtVn5g59;)Ylee8A-?vby7C~`foURBsG695($4OV?7x5W zGAAVC+m$XoC5yXg8?$zIV#1_&2)ydlE1`r{o`C!Z@cEg!Lds9^h6O5|c#oW=)^Z#F zBUvcGq&cZ~7tdndeYjLqKxI&nkO|gIc|xbk%#;iKo0loZbELE#svwu`6HZ!*rxKjc z_dp>>2x*%#SI@fB?5j8&M*)Ptd#==97nLGOJfF#06gX3eyU2 zN+lr$B7(C$R3Ffzuo2=S_WHtO9+ADYl};Wn>$WvX8r@!s_YH-VmwD#p7ZnaIu+Oz5 z8hJteD_~CkKLvaL0;%&@^1;w;Pb-rh9RYW1nX)O8^9diSw5LoBN_}x3TD=J+ z!9NllHTEt`2HCHJa8H&te2oHy7kJ{9aykp+QwTm#6b|J12JOkzg1kC!DdrNod>@^t zO%E97`5Pt05_#GdaSjyE(@9ET_vh2u;)R(Nv$4LuzKryA-GRYD`TG~nf_K=*rd+Q_XlA^^io~I)t|`<=f6~*_)0&sh z$HB-cLuR>u6sg)+9YaNBW!b^ik~};-R>zKg>02=Zy@=wu-v}&O60-Ko`8gWoq2Na-nxe7=6PE?3zuK!2O;9{6tx!gh-q07jW_>*E>7?)LbS$R5zZW{SWy?mnGP|bcb zg2xxXcPayblH15>&dgUgbp|~^yd=N3OA&|PCYia`t`8iTW~0-EG++U6Q(Wmt;UDmm z7&mclrP01;0!SDP5pcS2Q$pES5QkTt1W>X_<<6Sfo}D_(a^{6`JMX(7JV{OsH)53` z+x8hj=0Kb=n{^dF|Q=P{K#X#|?Z+%z4jjvXWW&dEip3 zr^jHITS=y)kbBIu2=g9`A z1T@vo$jBJ^=qsUO?yAq!h~#sE&5Zj|idt8Qi;G*?+4Tu8S@1Ea-HrB!ry2JPGLd>gYwC)v7@5qc8~}%7C_=X8ZT|Ogbtllga%nw>a9`+P>^LU5qLc 
z4q=IjiE^t}?Q(H(dD7Cd=e1)~UO@ruz*CEWqGN$8GJ|F&vozJqLcVO5 + + + + + gc-compaction-split + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 2 + + + + + + Job 3 + + + + + + Job 4 + + + + + + Job 5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Delta Layer + + + + + + + Image Layer + + + + + From 68120cfa31b10eda7f807c74b6049f60d7400a45 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 14 May 2025 15:19:53 +0200 Subject: [PATCH 097/142] Fix Cloud Extensions Regression (#11907) ## Problem The regression test on extensions relied on the admin API to set the default endpoint settings, which is not stable and requires admin privileges. Specifically: - The workflow was using `default_endpoint_settings` to configure necessary PostgreSQL settings like `DateStyle`, `TimeZone`, and `neon.allow_unstable_extensions` - This approach was failing because the API endpoint for setting `default_endpoint_settings` was changed (referenced in a comment as issue #27108) - The admin API requires special privileges. ## Summary of changes We get rid of the admin API dependency and use ALTER DATABASE statements instead: **Removed the default_endpoint_settings mechanism:** - Removed the default_endpoint_settings input parameter from the neon-project-create action - Removed the API call that was attempting to set these settings at the project level - Completely removed the default_endpoint_settings configuration from the cloud-extensions workflow **Added database-level settings:** - Created a new `alter_db.sh` script that applies the same settings directly to each test database - Modified all extension test scripts to call this script after database creation --- .../actions/neon-project-create/action.yml | 20 ------------------- .github/workflows/cloud-extensions.yml | 15 +------------- docker-compose/ext-src/alter_db.sh | 8 ++++++++ .../ext-src/pg_graphql-src/regular-test.sh | 1 + .../ext-src/pgrag-src/regular-test.sh | 1 + docker-compose/ext-src/pgx_ulid-src/Makefile | 1 + .../ext-src/plv8-src/regular-test.sh | 1 + .../ext-src/rag_bge_small_en_v15-src/Makefile | 1 + .../rag_jina_reranker_v1_tiny_en-src/Makefile | 1 + .../ext-src/rum-src/regular-test.sh | 1 + 10 files changed, 16 insertions(+), 34 deletions(-) create mode 100755 docker-compose/ext-src/alter_db.sh diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index a5b4104908..d7ff05be1a 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -49,10 +49,6 @@ inputs: description: 'A JSON object with project settings' required: false default: '{}' - default_endpoint_settings: - description: 'A JSON object with the default endpoint settings' - required: false - default: '{}' outputs: dsn: @@ -139,21 +135,6 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"scheduling\": \"Essential\"}" fi - # XXX - # This is a workaround for the default endpoint settings, which currently do not allow some settings in the public API. 
- # https://github.com/neondatabase/cloud/issues/27108 - if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then - PROJECT_DATA=$(curl -X GET \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - -d "{\"scheduling\": \"Essential\"}" - ) - NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}") - curl -X POST --fail \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - --data "${NEW_DEFAULT_ENDPOINT_SETTINGS}" - fi env: @@ -171,4 +152,3 @@ runs: PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} PROJECT_SETTINGS: ${{ inputs.project_settings }} - DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }} diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml index 4114f0f9b4..25fe0877d9 100644 --- a/.github/workflows/cloud-extensions.yml +++ b/.github/workflows/cloud-extensions.yml @@ -35,7 +35,7 @@ jobs: matrix: pg-version: [16, 17] - runs-on: [ self-hosted, small ] + runs-on: us-east-2 container: # We use the neon-test-extensions image here as it contains the source code for the extensions. image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest @@ -71,20 +71,7 @@ jobs: region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} - # We need these settings to get the expected output results. - # We cannot use the environment variables e.g. PGTZ due to - # https://github.com/neondatabase/neon/issues/1287 - default_endpoint_settings: > - { - "pg_settings": { - "DateStyle": "Postgres,MDY", - "TimeZone": "America/Los_Angeles", - "compute_query_id": "off", - "neon.allow_unstable_extensions": "on" - } - } api_key: ${{ secrets.NEON_STAGING_API_KEY }} - admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} - name: Run the regression tests run: /run-tests.sh -r /ext-src diff --git a/docker-compose/ext-src/alter_db.sh b/docker-compose/ext-src/alter_db.sh new file mode 100755 index 0000000000..6df37e1c9b --- /dev/null +++ b/docker-compose/ext-src/alter_db.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# We need these settings to get the expected output results. +# We cannot use the environment variables e.g. PGTZ due to +# https://github.com/neondatabase/neon/issues/1287 +export DATABASE=${1:-contrib_regression} +psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \ + -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \ + -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'" \ diff --git a/docker-compose/ext-src/pg_graphql-src/regular-test.sh b/docker-compose/ext-src/pg_graphql-src/regular-test.sh index 85e1ae057a..9e7d63b612 100755 --- a/docker-compose/ext-src/pg_graphql-src/regular-test.sh +++ b/docker-compose/ext-src/pg_graphql-src/regular-test.sh @@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/} TESTS=${TESTS/sqli_connection/} dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression ${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS} diff --git a/docker-compose/ext-src/pgrag-src/regular-test.sh b/docker-compose/ext-src/pgrag-src/regular-test.sh index 6cb1b049a4..22eb7498fd 100755 --- a/docker-compose/ext-src/pgrag-src/regular-test.sh +++ b/docker-compose/ext-src/pgrag-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname "${0}")" dropdb --if-exist contrib_regression createdb contrib_regression +. ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag" PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions diff --git a/docker-compose/ext-src/pgx_ulid-src/Makefile b/docker-compose/ext-src/pgx_ulid-src/Makefile index 6480c48441..00975e8c48 100644 --- a/docker-compose/ext-src/pgx_ulid-src/Makefile +++ b/docker-compose/ext-src/pgx_ulid-src/Makefile @@ -20,5 +20,6 @@ installcheck: regression-test regression-test: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)" $(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/plv8-src/regular-test.sh b/docker-compose/ext-src/plv8-src/regular-test.sh index b10cc65e8a..d5224e341c 100755 --- a/docker-compose/ext-src/plv8-src/regular-test.sh +++ b/docker-compose/ext-src/plv8-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')" REGRESS="${REGRESS/startup_perms/}" diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile index ac87cc511b..de6bdd06c0 100644 --- a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) \ No newline at end of file diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile index e81f94ef47..7adcad32f7 100644 --- a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/rum-src/regular-test.sh b/docker-compose/ext-src/rum-src/regular-test.sh index d1d45a36ef..815c1adb53 100755 --- a/docker-compose/ext-src/rum-src/regular-test.sh +++ b/docker-compose/ext-src/rum-src/regular-test.sh @@ -3,5 +3,6 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. ../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array \ No newline at end of file From 32a12783fde3aeb246457ae79b18dc00f85f8896 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 May 2025 18:30:21 +0200 Subject: [PATCH 098/142] pageserver: batching & concurrent IO: update binary-built-in defaults; reduce CI matrix (#11923) Use the current production config for batching & concurrent IO. Remove the permutation testing for unit tests from CI. (The pageserver unit test matrix takes ~10min for debug builds). Drive-by-fix use of `if cfg!(test)` inside crate `pageserver_api`. It is ineffective for early-enabling new defaults for pageserver unit tests only. The reason is that the `test` cfg is only set for the crate under test but not its dependencies. So, `cargo test -p pageserver` will build `pageserver_api` with `cfg!(test) == false`. Resort to checking for feature flag `testing` instead, since all our unit tests are run with `--feature testing`. 
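As an editorial aside (not part of the original patch), here is a minimal Rust sketch of the behaviour described above; the function name `default_flag` is made up for illustration:

```rust
// Editorial sketch, not from this patch: illustrates why `cfg!(test)` in a
// dependency crate is not set by `cargo test -p <depending-crate>`, while a
// Cargo feature such as `testing` is propagated to the dependency.
pub fn default_flag() -> bool {
    if cfg!(test) {
        // Only true while compiling *this* crate's own test harness.
        true
    } else if cfg!(feature = "testing") {
        // True whenever the `testing` feature is enabled by the build that
        // pulls this crate in as a dependency.
        true
    } else {
        false
    }
}
```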
refs - `scattered-lsn` batching has been implemented and rolled out in all envs, cf https://github.com/neondatabase/neon/issues/10765 - preliminary for https://github.com/neondatabase/neon/pull/10466 - epic https://github.com/neondatabase/neon/issues/9377 - epic https://github.com/neondatabase/neon/issues/9378 - drive-by fix https://neondb.slack.com/archives/C0277TKAJCA/p1746821515504219 --- .github/workflows/_build-and-test-locally.yml | 22 +++++++------------ .github/workflows/build_and_test.yml | 2 -- libs/pageserver_api/src/config.rs | 20 +++++------------ libs/pageserver_api/src/models.rs | 11 +--------- libs/utils/src/tracing_span_assert.rs | 4 ++-- 5 files changed, 17 insertions(+), 42 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 7cede309f3..663afa2c8b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -279,18 +279,14 @@ jobs: # run all non-pageserver tests ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' - # run pageserver tests with different settings - for get_vectored_concurrent_io in sequential sidecar-task; do - for io_engine in std-fs tokio-epoll-uring ; do - for io_mode in buffered direct direct-rw ; do - NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \ - ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' - done - done - done + # run pageserver tests + # (When developing new pageserver features gated by config fields, we commonly make the rust + # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME. + # Then run the nextest invocation below for all relevant combinations. Singling out the + # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) 
+ NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -405,8 +401,6 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e0995218f9..6b19f6ef01 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -323,8 +323,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 5b0c13dd89..7e0bb7dc57 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -639,23 +639,15 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: if !cfg!(test) { - PageServicePipeliningConfig::Serial - } else { - // Do not turn this into the default until scattered reads have been - // validated and rolled-out fully. - PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + page_service_pipelining: PageServicePipeliningConfig::Pipelined( + PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, - }) - }, - get_vectored_concurrent_io: if !cfg!(test) { - GetVectoredConcurrentIo::Sequential - } else { - GetVectoredConcurrentIo::SidecarTask - }, - enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + }, + ), + get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask, + enable_read_path_debugging: if cfg!(feature = "testing") { Some(true) } else { None diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5fcdefba66..89d531d671 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1803,7 +1803,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::sync::LazyLock; #[derive( Copy, @@ -1851,15 +1850,7 @@ pub mod virtual_file { impl IoMode { pub fn preferred() -> Self { - // The default behavior when running Rust unit tests without any further - // flags is to use the newest behavior (DirectRw). - // The CI uses the environment variable to unit tests for all different modes. - // NB: the Python regression & perf tests have their own defaults management - // that writes pageserver.toml; they do not use this variable. 
- static ENV_OVERRIDE: LazyLock> = LazyLock::new(|| { - utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE") - }); - ENV_OVERRIDE.unwrap_or(IoMode::DirectRw) + IoMode::DirectRw } } diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index 3d15e08400..857d98b644 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -127,12 +127,12 @@ macro_rules! __check_fields_present { match check_fields_present0($extractors) { Ok(FoundEverything) => Ok(()), - Ok(Unconfigured) if cfg!(test) => { + Ok(Unconfigured) if cfg!(feature = "testing") => { // allow unconfigured in tests Ok(()) }, Ok(Unconfigured) => { - panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer") + panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#) }, Err(missing) => Err(missing) } From 48b870bc078bd2c450eb7b468e743b9c118549bf Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 15 May 2025 07:45:22 +0300 Subject: [PATCH 099/142] Use unlogged build in GIST for storing root page (#11892) ## Problem See https://github.com/neondatabase/neon/issues/11891 Newly added assert is first when root page of GIST index is written to the disk as part of sorted build. ## Summary of changes Wrap writing of root page in unlogged build. https://github.com/neondatabase/postgres/pull/632 https://github.com/neondatabase/postgres/pull/633 https://github.com/neondatabase/postgres/pull/634 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ead1e76bdc..4cca6f8083 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ead1e76bdcb71ef87f52f0610bd7333247f75179 +Subproject commit 4cca6f8083483dda9e12eae292cf788d45bd561f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 052df87d33..daa81cffcf 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 052df87d338dc30687d0c96f1a4d9b6cb4882b2e +Subproject commit daa81cffcf063c54b29a9aabdb6604625f675ad0 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index bb5eee65ac..15710a76b7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd +Subproject commit 15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc diff --git a/vendor/revisions.json b/vendor/revisions.json index cf9f474e1a..0fc2d3996d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,14 +5,14 @@ ], "v16": [ "16.9", - "bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd" + "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc" ], "v15": [ "15.13", - "052df87d338dc30687d0c96f1a4d9b6cb4882b2e" + "daa81cffcf063c54b29a9aabdb6604625f675ad0" ], "v14": [ "14.18", - "ead1e76bdcb71ef87f52f0610bd7333247f75179" + "4cca6f8083483dda9e12eae292cf788d45bd561f" ] } From 9e5a41a3423782b1ab5f097e04583f38b78d9ba9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Thu, 15 May 2025 15:02:16 +0800 Subject: [PATCH 100/142] fix(scrubber): `remote_storage` error causes layers to be deleted as orphans (#11924)
## Problem Closes https://github.com/neondatabase/neon/issues/11159; we occasionally saw wrongful deletions of layer files that were still in use, as well as errors in staging. This patch fixes it. Example errors: ``` Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n 0: dispatch failure\n 1: timeout\n 2: error trying to connect: HTTP connect timeout occurred after 3.1s\n ``` This error should not be raised: the file may well exist, but the failed HEAD request means we cannot tell whether it does.
## Summary of changes Only generate "cannot find layer" errors when `head_object` returns `NotFound`. Signed-off-by: Alex Chi Z
--- storage_scrubber/src/checks.rs | 43 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-)
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 40f3523a7e..865f0908f9 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{ }; use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if let Err(e) = response { - // Object is not present. - let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); + match response { + Ok(_) => {} + Err(DownloadError::NotFound) => { + // Object is not present. + let is_l0 = + LayerMap::is_l0(layer.key_range(), layer.is_delta()); - let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", - layer, - metadata.generation.get_suffix(), - metadata.shard, - is_l0, - e, - ); + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); - if is_l0 || ignore_error { - result.warnings.push(msg); - } else { - result.errors.push(msg); + if is_l0 || ignore_error { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } + Err(e) => { + tracing::warn!( + "cannot check if the layer {}{} is present in remote storage (error: {})", + layer, + metadata.generation.get_suffix(), + e, + ); } } }
From 42e4cf18c97dad427f882c04a70bd33a54503e26 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 15 May 2025 10:53:59 +0100 Subject: [PATCH 101/142] CI(neon_extra_builds): fix workflow syntax (#11932) ## Problem ``` Error when evaluating 'strategy' for job 'build-pgxn'.
neondatabase/neon/.github/workflows/build-macos.yml@7907a9e2bf898f3d22b98d9d4d2c6ffc4d480fc3 (Line: 45, Col: 27): Matrix vector 'postgres-version' does not contain any values ``` See https://github.com/neondatabase/neon/actions/runs/15039594216/job/42268015127?pr=11929
## Summary of changes - Fix typo: `.chnages` -> `.changes` - Ensure the output is actually JSON by moving the step output to an env variable
--- .github/workflows/neon_extra_builds.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 9c504eb5bf..3427a0eb49 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -63,8 +63,10 @@ jobs: - name: Filter out only v-string for build matrix id: postgres_changes + env: + CHANGES: ${{ steps.files_changed.outputs.changes }} run: | - v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) + v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" check-macos-build:
From a703cd342b1f7f8faf5920cec8ef09902f94eaa8 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 15 May 2025 11:02:11 +0100 Subject: [PATCH 102/142] storage_controller: enforce generations in import upcalls (#11900)
## Problem Import up-calls did not enforce the usage of the latest generation. The import might have finished in a previous generation, but not in the latest one. Hence, the controller might try to activate a timeline before it is ready. In theory, that would be fine, but it's tricky to reason about.
## Summary of Changes The pageserver provides the current generation in the upcall to the storage controller, and the latter validates the generation. If the generation is stale, we return an error which stops progress of the import job. Note that the import job will retry the upcall until the stale location is detached. I'll add some proper tests for this as part of the [checkpointing PR](https://github.com/neondatabase/neon/pull/11862). Closes https://github.com/neondatabase/neon/issues/11884
--- libs/pageserver_api/src/upcall_api.rs | 9 ++ pageserver/src/controller_upcall_client.rs | 22 +++- pageserver/src/deletion_queue.rs | 2 + .../src/tenant/timeline/import_pgdata.rs | 7 +- storage_controller/src/http.rs | 12 +- storage_controller/src/service.rs | 107 ++++++++++++++++-- 6 files changed, 142 insertions(+), 17 deletions(-)
diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 7ee63f9036..4dce5f7817 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -4,6 +4,7 @@ //!
See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; +use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use crate::controller_api::NodeRegisterRequest; @@ -63,9 +64,17 @@ pub struct ValidateResponseTenant { pub valid: bool, } +#[derive(Serialize, Deserialize)] +pub struct TimelineImportStatusRequest { + pub tenant_shard_id: TenantShardId, + pub timeline_id: TimelineId, + pub generation: Generation, +} + #[derive(Serialize, Deserialize)] pub struct PutTimelineImportStatusRequest { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub status: ShardImportStatus, + pub generation: Generation, } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 6d186b091a..779ef3e37d 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -7,7 +7,7 @@ use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - ValidateRequest, ValidateRequestTenant, ValidateResponse, + TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse, }; use reqwest::Certificate; use serde::Serialize; @@ -51,12 +51,14 @@ pub trait StorageControllerUpcallApi { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> impl Future> + Send; fn get_timeline_import_status( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, ) -> impl Future, RetryForeverError>> + Send; } @@ -292,6 +294,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> Result<(), RetryForeverError> { let url = self @@ -302,6 +305,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { let request = PutTimelineImportStatusRequest { tenant_shard_id, timeline_id, + generation, status, }; @@ -313,15 +317,27 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, ) -> Result, RetryForeverError> { let url = self .base_url - .join(format!("timeline_import_status/{}/{}", tenant_shard_id, timeline_id).as_str()) + .join("timeline_import_status") .expect("Failed to build path"); + let request = TimelineImportStatusRequest { + tenant_shard_id, + timeline_id, + generation, + }; + Ok(backoff::retry( || async { - let response = self.http_client.get(url.clone()).send().await?; + let response = self + .http_client + .get(url.clone()) + .json(&request) + .send() + .await?; if let Err(err) = response.error_for_status_ref() { if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 65b2de28cd..0bbad87c09 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -793,6 +793,7 @@ mod test { &self, _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, + _generation: Generation, _status: pageserver_api::models::ShardImportStatus, ) -> Result<(), RetryForeverError> { unimplemented!() @@ -802,6 +803,7 @@ mod test { &self, _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, + _generation: Generation, ) -> Result, RetryForeverError> { unimplemented!() } diff 
--git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 53e15e5395..5fac9e0ce7 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -48,7 +48,11 @@ pub async fn doit( let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); let shard_status = storcon_client - .get_timeline_import_status(timeline.tenant_shard_id, timeline.timeline_id) + .get_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ) .await .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; @@ -175,6 +179,7 @@ pub async fn doit( .put_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, + timeline.generation, // TODO(vlad): What about import errors? ShardImportStatus::Done, ) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8d459cab9c..02c02c0e7f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -31,7 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ - PutTimelineImportStatusRequest, ReAttachRequest, ValidateRequest, + PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest, }; use pageserver_client::{BlockUnblock, mgmt_api}; use routerify::Middleware; @@ -160,22 +160,22 @@ async fn handle_validate(req: Request) -> Result, ApiError> async fn handle_get_timeline_import_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; - let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; - - let req = match maybe_forward(req).await { + let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { return res; } ForwardOutcome::NotForwarded(req) => req, }; + let get_req = json_request::(&mut req).await?; + let state = get_state(&req); + json_response( StatusCode::OK, state .service - .handle_timeline_shard_import_progress(tenant_shard_id, timeline_id) + .handle_timeline_shard_import_progress(get_req) .await?, ) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 05430733c2..852005639a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -47,7 +47,7 @@ use pageserver_api::shard::{ }; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - ValidateRequest, ValidateResponse, ValidateResponseTenant, + TimelineImportStatusRequest, ValidateRequest, ValidateResponse, ValidateResponseTenant, }; use pageserver_client::{BlockUnblock, mgmt_api}; use reqwest::{Certificate, StatusCode}; @@ -194,6 +194,14 @@ pub(crate) enum LeadershipStatus { Candidate, } +enum ShardGenerationValidity { + Valid, + Mismatched { + claimed: Generation, + actual: Option, + }, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; @@ -3909,19 +3917,36 @@ impl Service { pub(crate) async fn handle_timeline_shard_import_progress( self: &Arc, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, + req: TimelineImportStatusRequest, ) -> Result { + let validity = self + .validate_shard_generation(req.tenant_shard_id, 
req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress fetch from stale generation" + ); + + return Err(ApiError::BadRequest(anyhow::anyhow!("Invalid generation"))); + } + } + let maybe_import = self .persistence - .get_timeline_import(tenant_shard_id.tenant_id, timeline_id) + .get_timeline_import(req.tenant_shard_id.tenant_id, req.timeline_id) .await?; let import = maybe_import.ok_or_else(|| { ApiError::NotFound( format!( "import for {}/{} not found", - tenant_shard_id.tenant_id, timeline_id + req.tenant_shard_id.tenant_id, req.timeline_id ) .into(), ) @@ -3930,11 +3955,11 @@ impl Service { import .shard_statuses .0 - .get(&tenant_shard_id.to_index()) + .get(&req.tenant_shard_id.to_index()) .cloned() .ok_or_else(|| { ApiError::NotFound( - format!("shard {} not found", tenant_shard_id.shard_slug()).into(), + format!("shard {} not found", req.tenant_shard_id.shard_slug()).into(), ) }) } @@ -3943,6 +3968,24 @@ impl Service { self: &Arc, req: PutTimelineImportStatusRequest, ) -> Result<(), ApiError> { + let validity = self + .validate_shard_generation(req.tenant_shard_id, req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress update from stale generation" + ); + + return Err(ApiError::PreconditionFailed("Invalid generation".into())); + } + } + let res = self .persistence .update_timeline_import(req.tenant_shard_id, req.timeline_id, req.status) @@ -3977,6 +4020,56 @@ impl Service { Ok(()) } + /// Check that a provided generation for some tenant shard is the most recent one. + /// + /// Validate with the in-mem state first, and, if that passes, validate with the + /// database state which is authoritative. + async fn validate_shard_generation( + self: &Arc, + tenant_shard_id: TenantShardId, + generation: Generation, + ) -> Result { + { + let locked = self.inner.read().unwrap(); + let tenant_shard = + locked + .tenants + .get(&tenant_shard_id) + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if tenant_shard.generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: tenant_shard.generation, + }); + } + } + + let mut db_generations = self + .persistence + .shard_generations(std::iter::once(&tenant_shard_id)) + .await?; + let (_tid, db_generation) = + db_generations + .pop() + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if db_generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: db_generation, + }); + } + + Ok(ShardGenerationValidity::Valid) + } + /// Finalize the import of a timeline /// /// This method should be called once all shards have reported that the import is complete. From 2621ce2daf2a49408f54a687e9e691b87f3477d0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 15 May 2025 14:18:22 +0100 Subject: [PATCH 103/142] pageserver: checkpoint import progress in the storage controller (#11862) ## Problem Timeline imports do not have progress checkpointing. 
Any time that the tenant is shut-down, all progress is lost and the import restarts from the beginning when the tenant is re-attached. ## Summary of changes This PR adds progress checkpointing. ### Preliminaries The **unit of work** is a `ChunkProcessingJob`. Each `ChunkProcessingJob` deals with the import for a set of key ranges. The job split is done by using an estimation of how many pages each job will produce. The planning stage must be **pure**: given a fixed set of contents in the import bucket, it will always yield the same plan. This property is enforced by checking that the hash of the plan is identical when resuming from a checkpoint. The storage controller tracks the progress of each shard in the import in the database in the form of the **latest job** that has has completed. ### Flow This is the high level flow for the happy path: 1. On the first run of the import task, the import task queries storcon for the progress and sees that none is recorded. 2. Execute the preparatory stage of the import 3. Import jobs start running concurrently in a `FuturesOrdered`. Every time the checkpointing threshold of jobs has been reached, notify the storage controller. 4. Tenant is detached and re-attached 5. Import task starts up again and gets the latest progress checkpoint from the storage controller in the form of a job index. 6. The plan is computed again and we check that the hash matches with the original plan. 7. Jobs are spawned from where the previous import task left off. Note that we will not report progress after the completion of each job, so some jobs might run twice. Closes https://github.com/neondatabase/neon/issues/11568 Closes https://github.com/neondatabase/neon/issues/11664 --- Cargo.lock | 1 + libs/pageserver_api/src/config.rs | 2 + libs/pageserver_api/src/models.rs | 15 +- pageserver/Cargo.toml | 1 + .../src/tenant/timeline/import_pgdata.rs | 270 +++++++++++------- .../src/tenant/timeline/import_pgdata/flow.rs | 188 ++++++++++-- storage_controller/src/service.rs | 2 +- storage_controller/src/timeline_import.rs | 9 +- test_runner/fixtures/neon_fixtures.py | 6 + 9 files changed, 357 insertions(+), 137 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6df5d4a71e..f075b45e49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4331,6 +4331,7 @@ dependencies = [ "toml_edit", "tracing", "tracing-utils", + "twox-hash", "url", "utils", "uuid", diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7e0bb7dc57..f2ba50a86f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -305,6 +305,7 @@ impl From for tracing_utils::Protocol { pub struct TimelineImportConfig { pub import_job_concurrency: NonZeroUsize, pub import_job_soft_size_limit: NonZeroUsize, + pub import_job_checkpoint_threshold: NonZeroUsize, } pub mod statvfs { @@ -661,6 +662,7 @@ impl Default for ConfigToml { timeline_import_config: TimelineImportConfig { import_job_concurrency: NonZeroUsize::new(128).unwrap(), import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(), }, } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 89d531d671..58b8d80c0a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -336,14 +336,25 @@ impl TimelineCreateRequest { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum ShardImportStatus { - InProgress, + InProgress(Option), Done, Error(String), 
} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardImportProgress { + /// Total number of jobs in the import plan + pub jobs: usize, + /// Number of jobs completed + pub completed: usize, + /// Hash of the plan + pub import_plan_hash: u64, +} + impl ShardImportStatus { pub fn is_terminal(&self) -> bool { match self { - ShardImportStatus::InProgress => false, + ShardImportStatus::InProgress(_) => false, ShardImportStatus::Done | ShardImportStatus::Error(_) => true, } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8abd504922..b7b3e0eaf1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -96,6 +96,7 @@ strum.workspace = true strum_macros.workspace = true wal_decoder.workspace = true smallvec.workspace = true +twox-hash.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 5fac9e0ce7..602b20df97 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use anyhow::{Context, bail}; +use importbucket_client::{ControlFile, RemoteStorageWrapper}; use pageserver_api::models::ShardImportStatus; use remote_storage::RemotePath; use tokio::task::JoinHandle; @@ -57,115 +58,40 @@ pub async fn doit( .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; info!(?shard_status, "peeking shard status"); - match shard_status { - None | Some(ShardImportStatus::InProgress) => { - // TODO: checkpoint the progress into the IndexPart instead of restarting - // from the beginning. - - // - // Wipe the slate clean - the flow does not allow resuming. - // We can implement resuming in the future by checkpointing the progress into the IndexPart. - // - info!("wipe the slate clean"); - { - // TODO: do we need to hold GC lock for this? 
- let mut guard = timeline.layers.write().await; - assert!( - guard.layer_map()?.open_layer.is_none(), - "while importing, there should be no in-memory layer" // this just seems like a good place to assert it - ); - let all_layers_keys = guard.all_persistent_layers(); - let all_layers: Vec<_> = all_layers_keys - .iter() - .map(|key| guard.get_from_key(key)) - .collect(); - let open = guard.open_mut().context("open_mut")?; - - timeline.remote_client.schedule_gc_update(&all_layers)?; - open.finish_gc_timeline(&all_layers); - } - - // - // Wait for pgdata to finish uploading - // - info!("wait for pgdata to reach status 'done'"); + match shard_status.unwrap_or(ShardImportStatus::InProgress(None)) { + ShardImportStatus::InProgress(maybe_progress) => { let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; - let status_prefix = RemotePath::from_string("status").unwrap(); - let pgdata_status_key = status_prefix.join("pgdata"); - loop { - let res = async { - let pgdata_status: Option = storage - .get_json(&pgdata_status_key) - .await - .context("get pgdata status")?; - info!(?pgdata_status, "peeking pgdata status"); - if pgdata_status.map(|st| st.done).unwrap_or(false) { - Ok(()) - } else { - Err(anyhow::anyhow!("pgdata not done yet")) - } - } - .await; - match res { - Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); - if tokio::time::timeout( - std::time::Duration::from_secs(10), - cancel.cancelled(), - ) - .await - .is_ok() - { - bail!("cancelled while waiting for pgdata"); - } - } - } - } - // - // Do the import - // - info!("do the import"); - let control_file = storage.get_control_file().await?; - let base_lsn = control_file.base_lsn(); + let control_file_res = if maybe_progress.is_none() { + // Only prepare the import once when there's no progress. + prepare_import(timeline, storage.clone(), &cancel).await + } else { + storage.get_control_file().await + }; - info!("update TimelineMetadata based on LSNs from control file"); - { - let pg_version = control_file.pg_version(); - let _ctx: &RequestContext = ctx; - async move { - // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the - // checkpoint record, and prev_record_lsn should point to its beginning. - // We should read the real end of the record from the WAL, but here we - // just fake it. - let disk_consistent_lsn = Lsn(base_lsn.0 + 8); - let prev_record_lsn = base_lsn; - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - Some(prev_record_lsn), - None, // no ancestor - Lsn(0), // no ancestor lsn - base_lsn, // latest_gc_cutoff_lsn - base_lsn, // initdb_lsn - pg_version, + let control_file = match control_file_res { + Ok(cf) => cf, + Err(err) => { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, ); - - let _start_lsn = disk_consistent_lsn + 1; - - timeline - .remote_client - .schedule_index_upload_for_full_metadata_update(&metadata)?; - - timeline.remote_client.wait_completion().await?; - - anyhow::Ok(()) } - } - .await?; + }; - flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; + let res = flow::run( + timeline.clone(), + control_file, + storage.clone(), + maybe_progress, + ctx, + ) + .await; + if let Err(err) = res { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, + ); + } // Communicate that shard is done. 
// Ensure at-least-once delivery of the upcall to storage controller @@ -180,7 +106,6 @@ pub async fn doit( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, - // TODO(vlad): What about import errors? ShardImportStatus::Done, ) .await @@ -188,16 +113,151 @@ pub async fn doit( anyhow::anyhow!("Shut down while putting timeline import status") })?; } - Some(ShardImportStatus::Error(err)) => { + ShardImportStatus::Error(err) => { info!( "shard status indicates that the shard is done (error), skipping import {}", err ); } - Some(ShardImportStatus::Done) => { + ShardImportStatus::Done => { info!("shard status indicates that the shard is done (success), skipping import"); } } Ok(()) } + +async fn prepare_import( + timeline: &Arc, + storage: RemoteStorageWrapper, + cancel: &CancellationToken, +) -> anyhow::Result { + // Wipe the slate clean before starting the import as a precaution. + // This method is only called when there's no recorded checkpoint for the import + // in the storage controller. + // + // Note that this is split-brain safe (two imports for same timeline shards running in + // different generations) because we go through the usual deletion path, including deletion queue. + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? + let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; + + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); + } + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let status_prefix = RemotePath::from_string("status").unwrap(); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + } else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefinitely waiting for pgdata to finish"); + if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + .await + .is_ok() + { + bail!("cancelled while waiting for pgdata"); + } + } + } + } + + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); + + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. 
+ let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); + + let _start_lsn = disk_consistent_lsn + 1; + + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline.remote_client.wait_completion().await?; + + anyhow::Ok(()) + } + } + .await?; + + Ok(control_file) +} + +async fn terminate_flow_with_error( + timeline: &Arc, + error: anyhow::Error, + storcon_client: &StorageControllerUpcallClient, + cancel: &CancellationToken, +) -> anyhow::Error { + // The import task is a aborted on tenant shutdown, so in principle, it should + // never be cancelled. To be on the safe side, check the cancellation tokens + // before marking the import as failed. + if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) { + let notify_res = storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::Error(format!("{error:#}")), + ) + .await; + + if let Err(_notify_error) = notify_res { + // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries + // forever internally, so errors returned by it can only be due to cancellation. + info!("failed to notify storcon about permanent import error"); + } + + // Will be logged by [`Tenant::create_timeline_import_pgdata_task`] + error + } else { + anyhow::anyhow!("Import task cancelled") + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 5b9c8ec5b5..c8c3bdcdfb 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -29,10 +29,11 @@ //! 
- version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) use std::collections::HashSet; +use std::hash::{Hash, Hasher}; use std::ops::Range; use std::sync::Arc; -use anyhow::{bail, ensure}; +use anyhow::ensure; use bytes::Bytes; use futures::stream::FuturesOrdered; use itertools::Itertools; @@ -43,6 +44,7 @@ use pageserver_api::key::{ slru_segment_size_to_key, }; use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; @@ -59,16 +61,18 @@ use super::Timeline; use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; use crate::assert_u64_eq_usize::UsizeIsU64; use crate::context::{DownloadBehavior, RequestContext}; +use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient}; use crate::pgdatadir_mapping::{ DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; use crate::task_mgr::TaskKind; -use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; +use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, control_file: ControlFile, storage: RemoteStorageWrapper, + import_progress: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let planner = Planner { @@ -81,9 +85,31 @@ pub async fn run( let import_config = &timeline.conf.timeline_import_config; let plan = planner.plan(import_config).await?; + // Hash the plan and compare with the hash of the plan we got back from the storage controller. + // If the two match, it means that the planning stage had the same output. + // + // This is not intended to be a cryptographically secure hash. 
+ const SEED: u64 = 42; + let mut hasher = twox_hash::XxHash64::with_seed(SEED); + plan.hash(&mut hasher); + let plan_hash = hasher.finish(); + + if let Some(progress) = &import_progress { + if plan_hash != progress.import_plan_hash { + anyhow::bail!("Import plan does not match storcon metadata"); + } + + // Handle collisions on jobs of unequal length + if progress.jobs != plan.jobs.len() { + anyhow::bail!("Import plan job length does not match storcon metadata") + } + } + pausable_failpoint!("import-timeline-pre-execute-pausable"); - plan.execute(timeline, import_config, ctx).await + let start_from_job_idx = import_progress.map(|progress| progress.completed); + plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx) + .await } struct Planner { @@ -93,8 +119,11 @@ struct Planner { tasks: Vec, } +#[derive(Hash)] struct Plan { jobs: Vec, + // Included here such that it ends up in the hash for the plan + shard: ShardIdentity, } impl Planner { @@ -198,7 +227,10 @@ impl Planner { pgdata_lsn, )); - Ok(Plan { jobs }) + Ok(Plan { + jobs, + shard: self.shard, + }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -327,25 +359,45 @@ impl Plan { async fn execute( self, timeline: Arc, + start_after_job_idx: Option, + import_plan_hash: u64, import_config: &TimelineImportConfig, ctx: &RequestContext, ) -> anyhow::Result<()> { + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &timeline.cancel); + let mut work = FuturesOrdered::new(); let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); let jobs_in_plan = self.jobs.len(); - let mut jobs = self.jobs.into_iter().enumerate().peekable(); - let mut results = Vec::new(); + let mut jobs = self + .jobs + .into_iter() + .enumerate() + .map(|(idx, job)| (idx + 1, job)) + .filter(|(idx, _job)| { + // Filter out any jobs that have been done already + if let Some(start_after) = start_after_job_idx { + *idx > start_after + } else { + true + } + }) + .peekable(); + + let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); + let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); // Run import jobs concurrently up to the limit specified by the pageserver configuration. // Note that we process completed futures in the oreder of insertion. This will be the // building block for resuming imports across pageserver restarts or tenant migrations. - while results.len() < jobs_in_plan { + while last_completed_job_idx < jobs_in_plan { tokio::select! 
{ permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { let permit = permit.expect("never closed"); let (job_idx, job) = jobs.next().expect("we peeked"); + let job_timeline = timeline.clone(); let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); @@ -357,13 +409,33 @@ impl Plan { }, maybe_complete_job_idx = work.next() => { match maybe_complete_job_idx { - Some(Ok((_job_idx, res))) => { - results.push(res); + Some(Ok((job_idx, res))) => { + assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx); + + res?; + last_completed_job_idx = job_idx; + + if last_completed_job_idx % checkpoint_every == 0 { + storcon_client.put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::InProgress(Some(ShardImportProgress { + jobs: jobs_in_plan, + completed: last_completed_job_idx, + import_plan_hash, + })) + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } }, Some(Err(_)) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); + anyhow::bail!( + "import job panicked or cancelled" + ); } None => {} } @@ -371,17 +443,7 @@ impl Plan { } } - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) = result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(()) } } @@ -553,6 +615,15 @@ struct ImportSingleKeyTask { buf: Bytes, } +impl Hash for ImportSingleKeyTask { + fn hash(&self, state: &mut H) { + let ImportSingleKeyTask { key, buf } = self; + + key.hash(state); + buf.hash(state); + } +} + impl ImportSingleKeyTask { fn new(key: Key, buf: Bytes) -> Self { ImportSingleKeyTask { key, buf } @@ -581,6 +652,20 @@ struct ImportRelBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportRelBlocksTask { + fn hash(&self, state: &mut H) { + let ImportRelBlocksTask { + shard_identity: _, + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportRelBlocksTask { fn new( shard_identity: ShardIdentity, @@ -665,6 +750,19 @@ struct ImportSlruBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportSlruBlocksTask { + fn hash(&self, state: &mut H) { + let ImportSlruBlocksTask { + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportSlruBlocksTask { fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { @@ -707,6 +805,7 @@ impl ImportTask for ImportSlruBlocksTask { } } +#[derive(Hash)] enum AnyImportTask { SingleKey(ImportSingleKeyTask), RelBlocks(ImportRelBlocksTask), @@ -753,6 +852,7 @@ impl From for AnyImportTask { } } +#[derive(Hash)] struct ChunkProcessingJob { range: Range, tasks: Vec, @@ -790,17 +890,51 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; + + { + let guard = timeline.layers.read().await; + let existing_layer = guard.try_get_from_key(&desc.key()); + if let Some(layer) = existing_layer { + if layer.metadata().generation != timeline.generation { + return Err(anyhow::anyhow!( + "Import attempted to rewrite layer file in the same generation: {}", + layer.local_path() + )); + } + } + } + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? 
} else { // dropping the writer cleans up return Ok(()); }; - // this is sharing the same code as create_image_layers + // The same import job might run multiple times since not each job is checkpointed. + // Hence, we must support the cases where the layer already exists. We cannot be + // certain that the existing layer is identical to the new one, so in that case + // we replace the old layer with the one we just generated. + let mut guard = timeline.layers.write().await; - guard - .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); + + let existing_layer = guard + .try_get_from_key(&resident_layer.layer_desc().key()) + .cloned(); + match existing_layer { + Some(existing) => { + guard.open_mut()?.rewrite_layers( + &[(existing.clone(), resident_layer.clone())], + &[], + &timeline.metrics, + ); + } + None => { + guard + .open_mut()? + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); + } + } + crate::tenant::timeline::drop_wlock(guard); timeline diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 852005639a..7e4bb627af 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4082,7 +4082,7 @@ impl Service { /// imports are stored in the database). #[instrument(skip_all, fields( tenant_id=%import.tenant_id, - shard_id=%import.timeline_id, + timeline_id=%import.timeline_id, ))] async fn finalize_timeline_import( self: &Arc, diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index 5d9d633932..909e8e2899 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -5,7 +5,7 @@ use http_utils::error::ApiError; use reqwest::Method; use serde::{Deserialize, Serialize}; -use pageserver_api::models::ShardImportStatus; +use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -28,7 +28,12 @@ impl ShardImportStatuses { ShardImportStatuses( shards .into_iter() - .map(|ts_id| (ts_id, ShardImportStatus::InProgress)) + .map(|ts_id| { + ( + ts_id, + ShardImportStatus::InProgress(None::), + ) + }) .collect(), ) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2801a0e867..9d86fd027c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1255,6 +1255,12 @@ class NeonEnv: "no_sync": True, # Look for gaps in WAL received from safekeepeers "validate_wal_contiguity": True, + # TODO(vlad): make these configurable through the builder + "timeline_import_config": { + "import_job_concurrency": 4, + "import_job_soft_size_limit": 512 * 1024, + "import_job_checkpoint_threshold": 4, + }, } # Batching (https://github.com/neondatabase/neon/issues/9377): From 31026d5a3c246956dda9ba4925efdc72ded42de0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 15 May 2025 17:13:15 +0100 Subject: [PATCH 104/142] pageserver: support import schema evolution (#11935) ## Problem Imports don't support schema evolution nicely. If we want to change the stuff we keep in storcon, we'd have to carry the old cruft around. ## Summary of changes Version import progress. Note that the import progress version determines the version of the import job split and execution. This means that we can also use it as a mechanism for deploying new import implementations in the future. 
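To make the versioning pattern concrete, here is a minimal sketch of the idea described above. It is not the code from this PR beyond the V1 shape visible in the diff below: serde is assumed to be available, and `ShardImportProgressV2` with its `checkpoint_lsn` field is purely hypothetical, only illustrating how a future format could be added without breaking checkpoints already persisted as V1.

```rust
use serde::{Deserialize, Serialize};

/// Per-shard import progress as persisted by the storage controller.
/// Each on-disk format gets its own variant, so old checkpoints keep deserializing.
#[derive(Serialize, Deserialize, Debug)]
pub enum ShardImportProgress {
    V1(ShardImportProgressV1),
    // Hypothetical future format: added alongside V1, never in place of it.
    V2(ShardImportProgressV2),
}

#[derive(Serialize, Deserialize, Debug)]
pub struct ShardImportProgressV1 {
    pub jobs: usize,
    pub completed: usize,
    pub import_plan_hash: u64,
}

/// Hypothetical V2 payload, illustrating a schema change.
#[derive(Serialize, Deserialize, Debug)]
pub struct ShardImportProgressV2 {
    pub jobs: usize,
    pub completed: usize,
    pub import_plan_hash: u64,
    pub checkpoint_lsn: u64,
}

/// The resuming side picks the job split / execution path based on the
/// checkpoint version; a fresh import is free to start with the newest format.
fn resume(progress: Option<ShardImportProgress>) {
    match progress {
        None => println!("fresh import: use the newest format"),
        Some(ShardImportProgress::V1(p)) => {
            println!("resume V1 import at job {}/{}", p.completed, p.jobs)
        }
        Some(ShardImportProgress::V2(p)) => {
            println!("resume V2 import at job {}/{}", p.completed, p.jobs)
        }
    }
}

fn main() {
    resume(None);
    resume(Some(ShardImportProgress::V1(ShardImportProgressV1 {
        jobs: 8,
        completed: 4,
        import_plan_hash: 0xdead_beef,
    })));
}
```

Because resuming code matches on the variant, a checkpoint written before a deploy keeps selecting the job split and execution it was created with, while new imports can opt into a newer implementation.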
--- libs/pageserver_api/src/models.rs | 7 ++- pageserver/src/controller_upcall_client.rs | 49 ++++++------------- pageserver/src/deletion_queue.rs | 2 +- .../src/tenant/timeline/import_pgdata.rs | 2 +- .../src/tenant/timeline/import_pgdata/flow.rs | 32 +++++++++--- 5 files changed, 50 insertions(+), 42 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 58b8d80c0a..e9b37c8ca6 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -342,7 +342,12 @@ pub enum ShardImportStatus { } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] -pub struct ShardImportProgress { +pub enum ShardImportProgress { + V1(ShardImportProgressV1), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardImportProgressV1 { /// Total number of jobs in the import plan pub jobs: usize, /// Number of jobs completed diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 779ef3e37d..dc38ea616c 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -59,7 +59,7 @@ pub trait StorageControllerUpcallApi { tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, - ) -> impl Future, RetryForeverError>> + Send; + ) -> impl Future> + Send; } impl StorageControllerUpcallClient { @@ -104,6 +104,7 @@ impl StorageControllerUpcallClient { &self, url: &url::Url, request: R, + method: reqwest::Method, ) -> Result where R: Serialize, @@ -113,7 +114,7 @@ impl StorageControllerUpcallClient { || async { let response = self .http_client - .post(url.clone()) + .request(method.clone(), url.clone()) .json(&request) .send() .await?; @@ -222,7 +223,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { register: register.clone(), }; - let response: ReAttachResponse = self.retry_http_forever(&url, request).await?; + let response: ReAttachResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), @@ -275,7 +278,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { return Err(RetryForeverError::ShuttingDown); } - let response: ValidateResponse = self.retry_http_forever(&url, request).await?; + let response: ValidateResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } @@ -309,7 +314,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { status, }; - self.retry_http_forever(&url, request).await + self.retry_http_forever(&url, request, reqwest::Method::POST) + .await } #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context @@ -318,7 +324,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, - ) -> Result, RetryForeverError> { + ) -> Result { let url = self .base_url .join("timeline_import_status") @@ -330,32 +336,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { generation, }; - Ok(backoff::retry( - || async { - let response = self - .http_client - .get(url.clone()) - .json(&request) - .send() - .await?; - - if let Err(err) = response.error_for_status_ref() { - if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) { - return Ok(None); - } else { - 
return Err(err); - } - } - response.json::().await.map(Some) - }, - |_| false, - 3, - u32::MAX, - "storage controller upcall", - &self.cancel, - ) - .await - .ok_or(RetryForeverError::ShuttingDown)? - .expect("We retry forever, this should never be reached")) + let response: ShardImportStatus = self + .retry_http_forever(&url, request, reqwest::Method::GET) + .await?; + Ok(response) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 0bbad87c09..7854fd9e36 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -804,7 +804,7 @@ mod test { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, _generation: Generation, - ) -> Result, RetryForeverError> { + ) -> Result { unimplemented!() } } diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 602b20df97..658d867c18 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -58,7 +58,7 @@ pub async fn doit( .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; info!(?shard_status, "peeking shard status"); - match shard_status.unwrap_or(ShardImportStatus::InProgress(None)) { + match shard_status { ShardImportStatus::InProgress(maybe_progress) => { let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c8c3bdcdfb..3e10a4e6d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -44,7 +44,7 @@ use pageserver_api::key::{ slru_segment_size_to_key, }; use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; -use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; +use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; @@ -74,6 +74,24 @@ pub async fn run( storage: RemoteStorageWrapper, import_progress: Option, ctx: &RequestContext, +) -> anyhow::Result<()> { + // Match how we run the import based on the progress version. + // If there's no import progress, it means that this is a new import + // and we can use whichever version we want. 
+ match import_progress { + Some(ShardImportProgress::V1(progress)) => { + run_v1(timeline, control_file, storage, Some(progress), ctx).await + } + None => run_v1(timeline, control_file, storage, None, ctx).await, + } +} + +async fn run_v1( + timeline: Arc, + control_file: ControlFile, + storage: RemoteStorageWrapper, + import_progress: Option, + ctx: &RequestContext, ) -> anyhow::Result<()> { let planner = Planner { control_file, @@ -416,15 +434,17 @@ impl Plan { last_completed_job_idx = job_idx; if last_completed_job_idx % checkpoint_every == 0 { + let progress = ShardImportProgressV1 { + jobs: jobs_in_plan, + completed: last_completed_job_idx, + import_plan_hash, + }; + storcon_client.put_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, - ShardImportStatus::InProgress(Some(ShardImportProgress { - jobs: jobs_in_plan, - completed: last_completed_job_idx, - import_plan_hash, - })) + ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress))) ) .await .map_err(|_err| { From a7ce323949d277fa720a612d710b810903c1b1ff Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 15 May 2025 19:48:13 +0200 Subject: [PATCH 105/142] benchmarking: extend `test_page_service_batching.py` to cover concurrent IO + batching under random reads (#10466) This PR commits the benchmarks I ran to qualify concurrent IO before we released it. Changes: - Add `l0stack` fixture; a reusable abstraction for creating a stack of L0 deltas each of which has 1 Value::Delta per page. - Such a stack of L0 deltas is a good and understandable demo for concurrent IO because to reconstruct any page, $layer_stack_height` Values need to be read. Before concurrent IO, the reads were sequential. With concurrent IO, they are executed concurrently. - So, switch `test_latency` to use the l0stack. - Teach `pagebench`, which is used by `test_latency`, to limit itself to the blocks of the relation created by the l0stack abstraction. - Additional parametrization of `test_latency` over dimensions `ps_io_concurrency,l0_stack_height,queue_depth` - Use better names for the tests to reflect what they do, leave interpretation of the (now quite high-dimensional) results to the reader - `test_{throughput => postgres_seqscan}` - `test_{latency => random_reads}` - Cut down on permutations to those we use in production. Runtime is about 2min. 
Refs - concurrent IO epic https://github.com/neondatabase/neon/issues/9378 - batching task: fixes https://github.com/neondatabase/neon/issues/9837 --------- Co-authored-by: Peter Bendel --- libs/pageserver_api/src/key.rs | 5 + .../pagebench/src/cmd/getpage_latest_lsn.rs | 10 +- .../bin/neon_local_create_deep_l0_stack.py | 59 +++++++ test_runner/fixtures/neon_fixtures.py | 11 +- .../pageserver/makelayers/__init__.py | 0 .../fixtures/pageserver/makelayers/l0stack.py | 148 ++++++++++++++++ test_runner/performance/README.md | 3 +- test_runner/performance/out_dir_to_csv.py | 57 ++++++ .../pageserver/test_page_service_batching.py | 167 ++++++++++-------- 9 files changed, 387 insertions(+), 73 deletions(-) create mode 100644 test_runner/bin/neon_local_create_deep_l0_stack.py create mode 100644 test_runner/fixtures/pageserver/makelayers/__init__.py create mode 100644 test_runner/fixtures/pageserver/makelayers/l0stack.py create mode 100644 test_runner/performance/out_dir_to_csv.py diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0c4d7fd4cb..c14975167b 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -910,6 +910,11 @@ impl Key { self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff } + #[inline(always)] + pub fn is_rel_block_of_rel(&self, rel: Oid) -> bool { + self.is_rel_block_key() && self.field4 == rel + } + #[inline(always)] pub fn is_rel_dir_key(&self) -> bool { self.field1 == 0x00 diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 771a7cbe5b..50419ec338 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -65,6 +65,9 @@ pub(crate) struct Args { #[clap(long, default_value = "1")] queue_depth: NonZeroUsize, + #[clap(long)] + only_relnode: Option, + targets: Option>, } @@ -206,7 +209,12 @@ async fn main_impl( for r in partitioning.keys.ranges.iter() { let mut i = r.start; while i != r.end { - if i.is_rel_block_key() { + let mut include = true; + include &= i.is_rel_block_key(); + if let Some(only_relnode) = args.only_relnode { + include &= i.is_rel_block_of_rel(only_relnode); + } + if include { filtered.add_key(i); } i = i.next(); diff --git a/test_runner/bin/neon_local_create_deep_l0_stack.py b/test_runner/bin/neon_local_create_deep_l0_stack.py new file mode 100644 index 0000000000..ebe11f7308 --- /dev/null +++ b/test_runner/bin/neon_local_create_deep_l0_stack.py @@ -0,0 +1,59 @@ +""" +Script to creates a stack of L0 deltas each of which should have 1 Value::Delta per page in `data`, +in your running neon_local setup. + +Use this bash setup to reset your neon_local environment. +The last line of this bash snippet will run this file here. 
+``` + export NEON_REPO_DIR=$PWD/.neon + export NEON_BIN_DIR=$PWD/target/release + $NEON_BIN_DIR/neon_local stop + rm -rf $NEON_REPO_DIR + $NEON_BIN_DIR/neon_local init + cat >> $NEON_REPO_DIR/pageserver_1/pageserver.toml <<"EOF" + # customizations + virtual_file_io_mode = "direct-rw" + page_service_pipelining={mode="pipelined", max_batch_size=32, execution="concurrent-futures"} + get_vectored_concurrent_io={mode="sidecar-task"} +EOF + $NEON_BIN_DIR/neon_local start + + psql 'postgresql://localhost:1235/storage_controller' -c 'DELETE FROM tenant_shards' + sed 's/.*get_vectored_concurrent_io.*/get_vectored_concurrent_io={mode="sidecar-task"}/' -i $NEON_REPO_DIR/pageserver_1/pageserver.toml + $NEON_BIN_DIR/neon_local pageserver restart + sleep 2 + $NEON_BIN_DIR/neon_local tenant create --set-default + ./target/debug/neon_local endpoint stop foo + rm -rf $NEON_REPO_DIR/endpoints/foo + ./target/debug/neon_local endpoint create foo + echo 'full_page_writes=off' >> $NEON_REPO_DIR/endpoints/foo/postgresql.conf + ./target/debug/neon_local endpoint start foo + + pushd test_runner; poetry run python3 -m bin.neon_local_create_deep_l0_stack 10; popd +``` +""" + +import sys + +import psycopg2 +from fixtures.common_types import TenantShardId, TimelineId +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.makelayers.l0stack import L0StackShape, make_l0_stack_standalone + +ps_http = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None) +vps_http = PageserverHttpClient(port=1234, is_testing_enabled_or_skip=lambda: None) + +tenants = ps_http.tenant_list() +assert len(tenants) == 1 +tenant_shard_id = TenantShardId.parse(tenants[0]["id"]) + +timlines = ps_http.timeline_list(tenant_shard_id) +assert len(timlines) == 1 +timeline_id = TimelineId(timlines[0]["timeline_id"]) + +connstr = "postgresql://cloud_admin@localhost:55432/postgres" +conn = psycopg2.connect(connstr) + +shape = L0StackShape(logical_table_size_mib=50, delta_stack_height=int(sys.argv[1])) + +make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, conn, shape) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9d86fd027c..e413b3c6d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1377,7 +1377,11 @@ class NeonEnv: force=config.config_init_force, ) - def start(self, timeout_in_seconds: int | None = None): + def start( + self, + timeout_in_seconds: int | None = None, + extra_ps_env_vars: dict[str, str] | None = None, + ): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) @@ -1396,7 +1400,10 @@ class NeonEnv: for pageserver in self.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] + lambda ps=pageserver: ps.start( # type: ignore[misc] + extra_env_vars=extra_ps_env_vars or {}, + timeout_in_seconds=timeout_in_seconds, + ), ) ) diff --git a/test_runner/fixtures/pageserver/makelayers/__init__.py b/test_runner/fixtures/pageserver/makelayers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/makelayers/l0stack.py b/test_runner/fixtures/pageserver/makelayers/l0stack.py new file mode 100644 index 0000000000..408ba1254f --- /dev/null +++ b/test_runner/fixtures/pageserver/makelayers/l0stack.py @@ -0,0 +1,148 @@ +from 
dataclasses import dataclass + +from psycopg2.extensions import connection as PgConnection + +from fixtures.common_types import Lsn, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn + + +@dataclass +class L0StackShape: + logical_table_size_mib: int = 50 + delta_stack_height: int = 20 + + +def make_l0_stack(endpoint: Endpoint, shape: L0StackShape): + """ + Creates stack of L0 deltas each of which should have 1 Value::Delta per page in table `data`. + """ + env = endpoint.env + + # TDOO: wait for storcon to finish any reonciles before jumping to action here? + description = env.storage_controller.tenant_describe(endpoint.tenant_id) + shards = description["shards"] + assert len(shards) == 1, "does not support sharding" + tenant_shard_id = TenantShardId.parse(shards[0]["tenant_shard_id"]) + + endpoint.config(["full_page_writes=off"]) + endpoint.reconfigure() + + ps = env.get_pageserver(shards[0]["node_attached"]) + + timeline_id = endpoint.show_timeline_id() + + vps_http = env.storage_controller.pageserver_api() + ps_http = ps.http_client() + endpoint_conn = endpoint.connect() + make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, endpoint_conn, shape) + + +def make_l0_stack_standalone( + vps_http: PageserverHttpClient, + ps_http: PageserverHttpClient, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + endpoint_conn: PgConnection, + shape: L0StackShape, +): + """ + See make_l0_stack for details. + + This function is a standalone version of make_l0_stack, usable from not-test code. + """ + + assert not tenant_shard_id.shard_index.is_sharded, ( + "the current implementation only supports unsharded tenants" + ) + + tenant_id = tenant_shard_id.tenant_id + conn = endpoint_conn + desired_size = shape.logical_table_size_mib * 1024 * 1024 + + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "1h", # doesn't matter, but 0 value will kill walredo every 10s + "compaction_threshold": 100000, # we just want L0s + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 100000, # we just want L0s + } + + vps_http.set_tenant_config(tenant_id, config) + + conn.autocommit = True + cur = conn.cursor() + + # Ensure full_page_writes are disabled so that all Value::Delta in + # pageserver are !will_init, and therefore a getpage needs to read + # the entire delta stack. + cur.execute("SHOW full_page_writes") + assert cur.fetchall()[0][0] == "off", "full_page_writes should be off" + + # each tuple is 23 (header) + 100 bytes = 123 bytes + # page header si 24 bytes + # 8k page size + # (8k-24bytes) / 123 bytes = 63 tuples per page + # set fillfactor to 10 to have 6 tuples per page + cur.execute("DROP TABLE IF EXISTS data") + cur.execute("CREATE TABLE data(id bigint, row char(92)) with (fillfactor=10)") + need_pages = desired_size // 8192 + need_rows = need_pages * 6 + log.info(f"Need {need_pages} pages, {need_rows} rows") + cur.execute(f"INSERT INTO data SELECT i,'row'||i FROM generate_series(1, {need_rows}) as i") + # Raise fillfactor to 100% so that all updates are HOT updates. + # We assert they're hot updates by checking fetch_id_to_page_mapping remains the same. 
+ cur.execute("ALTER TABLE data SET (fillfactor=100)") + + def settle_and_flush(): + cur.execute("SELECT pg_current_wal_flush_lsn()") + flush_lsn = Lsn(cur.fetchall()[0][0]) + wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, flush_lsn) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # create an L0 for the initial data we just inserted + settle_and_flush() + + # assert we wrote what we think we wrote + cur.execute(""" + with ntuples_per_page as ( + select (ctid::text::point)[0]::bigint pageno,count(*) ntuples from data group by pageno + ) + select ntuples, count(*) npages from ntuples_per_page group by ntuples order by ntuples; + """) + rows = cur.fetchall() + log.info(f"initial table layout: {rows}") + assert len(rows) == 1 + assert rows[0][0] == 6, f"expected 6 tuples per page, got {rows[0][0]}" + assert rows[0][1] == need_pages, f"expected {need_pages} pages, got {rows[0][1]}" + + def fetch_id_to_page_mapping(): + cur.execute(""" + SELECT id,(ctid::text::point)[0]::bigint pageno FROM data ORDER BY id + """) + return cur.fetchall() + + initial_mapping = fetch_id_to_page_mapping() + + # every iteration updates one tuple in each page + delta_stack_height = shape.delta_stack_height + for i in range(0, delta_stack_height): + log.info(i) + cur.execute(f"UPDATE data set row = row||',u' where id % 6 = {i % 6}") + log.info(f"modified rows: {cur.rowcount}") + assert cur.rowcount == need_pages + settle_and_flush() + post_update_mapping = fetch_id_to_page_mapping() + assert initial_mapping == post_update_mapping, "Postgres should be doing HOT updates" + + # Assert the layer count is what we expect it is + layer_map = vps_http.layer_map_info(tenant_id, timeline_id) + assert ( + len(layer_map.delta_l0_layers()) == delta_stack_height + 1 + 1 + ) # +1 for the initdb layer + 1 for the table creation & fill + assert len(layer_map.delta_l0_layers()) == len(layer_map.delta_layers()) # it's all L0s + assert len(layer_map.image_layers()) == 0 # no images diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 3b25a60e9b..21844648d1 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -15,7 +15,8 @@ Some handy pytest flags for local development: - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) - `--preserve-database-files` to skip cleanup -- `--out-dir` to produce a JSON with the recorded test metrics +- `--out-dir` to produce a JSON with the recorded test metrics. + There is a post-processing tool at `test_runner/performance/out_dir_to_csv.py`. # What performance tests do we have and how we run them diff --git a/test_runner/performance/out_dir_to_csv.py b/test_runner/performance/out_dir_to_csv.py new file mode 100644 index 0000000000..8647ad4acc --- /dev/null +++ b/test_runner/performance/out_dir_to_csv.py @@ -0,0 +1,57 @@ +# Tool to convert the JSON output from running a perf test with `--out-dir` to a CSV that +# can be easily pasted into a spreadsheet for quick viz & analysis. +# Check the `./README.md` in this directory for `--out-dir`. 
+# +# TODO: add the pytest.mark.parametrize to the json and make them columns here +# https://github.com/neondatabase/neon/issues/11878 + +import csv +import json +import os +import sys + + +def json_to_csv(json_file): + with open(json_file) as f: + data = json.load(f) + + # Collect all possible metric names to form headers + all_metrics = set() + for result in data.get("result", []): + for metric in result.get("data", []): + all_metrics.add(metric["name"]) + + # Sort metrics for consistent output + metrics = sorted(list(all_metrics)) + + # Create headers + headers = ["suit"] + metrics + + # Prepare rows + rows = [] + for result in data.get("result", []): + row = {"suit": result["suit"]} + + # Initialize all metrics to empty + for metric in metrics: + row[metric] = "" + + # Fill in available metrics + for item in result.get("data", []): + row[item["name"]] = item["value"] + + rows.append(row) + + # Write to stdout as CSV + writer = csv.DictWriter(sys.stdout, fieldnames=headers) + writer.writeheader() + writer.writerows(rows) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print(f"Usage: python {os.path.basename(__file__)} ") + sys.exit(1) + + json_file = sys.argv[1] + json_to_csv(json_file) diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index b17ca772c9..9e2312311a 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -10,7 +10,8 @@ from typing import Any import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin +from fixtures.pageserver.makelayers import l0stack from fixtures.utils import humantime_to_ms TARGET_RUNTIME = 30 @@ -34,28 +35,18 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): mode: str = "pipelined" -EXECUTION = ["concurrent-futures"] -BATCHING = ["uniform-lsn", "scattered-lsn"] - -NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in EXECUTION: - for batching in BATCHING: - NON_BATCHABLE.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - -BATCHABLE: list[PageServicePipeliningConfig] = [] +PS_IO_CONCURRENCY = ["sidecar-task"] +PIPELINING_CONFIGS: list[PageServicePipeliningConfig] = [] for max_batch_size in [32]: - for execution in EXECUTION: - for batching in BATCHING: - BATCHABLE.append( + for execution in ["concurrent-futures"]: + for batching in ["scattered-lsn"]: + PIPELINING_CONFIGS.append( PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) ) @pytest.mark.parametrize( - "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + "tablesize_mib, pipelining_config, target_runtime, ps_io_concurrency, effective_io_concurrency, readhead_buffer_size, name", [ # batchable workloads should show throughput and CPU efficiency improvements *[ @@ -63,20 +54,23 @@ for max_batch_size in [32]: 50, config, TARGET_RUNTIME, + ps_io_concurrency, 100, 128, f"batchable {dataclasses.asdict(config)}", ) - for config in BATCHABLE + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY ], ], ) -def test_throughput( +def test_postgres_seqscan( 
neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, tablesize_mib: int, pipelining_config: PageServicePipeliningConfig, target_runtime: int, + ps_io_concurrency: str, effective_io_concurrency: int, readhead_buffer_size: int, name: str, @@ -97,6 +91,10 @@ def test_throughput( If the compute provides pipeline depth (effective_io_concurrency=100), then pipelining configs, especially with max_batch_size>1 should yield dramatic improvements in all performance metrics. + + We advance the LSN from a disruptor thread to simulate the effect of a workload with concurrent writes + in another table. The `scattered-lsn` batching mode handles this well whereas the + initial implementatin (`uniform-lsn`) would break the batch. """ # @@ -114,7 +112,19 @@ def test_throughput( } ) # For storing configuration as a metric, insert a fake 0 with labels with actual data - params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) + params.update( + { + "config": ( + 0, + { + "labels": { + "pipelining_config": dataclasses.asdict(pipelining_config), + "ps_io_concurrency": ps_io_concurrency, + } + }, + ) + } + ) log.info("params: %s", params) @@ -266,7 +276,10 @@ def test_throughput( return iters env.pageserver.patch_config_toml_nonrecursive( - {"page_service_pipelining": dataclasses.asdict(pipelining_config)} + { + "page_service_pipelining": dataclasses.asdict(pipelining_config), + "get_vectored_concurrent_io": {"mode": ps_io_concurrency}, + } ) # set trace for log analysis below @@ -318,77 +331,63 @@ def test_throughput( ) -PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in EXECUTION: - for batching in BATCHING: - PRECISION_CONFIGS.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - - @pytest.mark.parametrize( - "pipelining_config,name", - [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS], + "pipelining_config,ps_io_concurrency,l0_stack_height,queue_depth,name", + [ + (config, ps_io_concurrency, l0_stack_height, queue_depth, f"{dataclasses.asdict(config)}") + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY + for queue_depth in [1, 2, 32] + for l0_stack_height in [0, 20] + ], ) -def test_latency( +def test_random_reads( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, pipelining_config: PageServicePipeliningConfig, + ps_io_concurrency: str, + l0_stack_height: int, + queue_depth: int, name: str, ): """ - Measure the latency impact of pipelining in an un-batchable workloads. - - An ideal implementation should not increase average or tail latencies for such workloads. - - We don't have support in pagebench to create queue depth yet. - => https://github.com/neondatabase/neon/issues/9837 + Throw pagebench random getpage at latest lsn workload from a single client against pageserver. 
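+
+    For reference, the pagebench invocation built below is roughly the following
+    (the connstring and the relnode OID of table `data` are resolved at runtime):
+
+        pagebench get-page-latest-lsn \
+            --page-service-connstring <pageserver connstr> \
+            --num-clients 1 --queue-depth <queue_depth> \
+            --only-relnode <relnode oid of table data> --runtime 10s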
""" # # Setup # + def build_snapshot_cb(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + l0stack.make_l0_stack( + endpoint, + l0stack.L0StackShape(logical_table_size_mib=50, delta_stack_height=l0_stack_height), + ) + return env + + env = neon_env_builder.build_and_use_snapshot( + f"test_page_service_batching--test_pagebench-{l0_stack_height}", build_snapshot_cb + ) + def patch_ps_config(ps_config): - if pipelining_config is not None: - ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["get_vectored_concurrent_io"] = {"mode": ps_io_concurrency} - neon_env_builder.pageserver_config_override = patch_ps_config + env.pageserver.edit_config_toml(patch_ps_config) - env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start("main") - conn = endpoint.connect() - cur = conn.cursor() + env.start() - cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends - cur.execute("SET effective_io_concurrency=1") - - cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") - cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") - - log.info("Filling the table") - cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") - tablesize = 50 * 1024 * 1024 - npages = tablesize // (8 * 1024) - cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? - - cur.close() - conn.close() - - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) - - endpoint.stop() + lsn = env.safekeepers[0].get_commit_lsn(env.initial_tenant, env.initial_timeline) + ep = env.endpoints.create_start("main", lsn=lsn) + data_table_relnode_oid = ep.safe_psql_scalar("SELECT 'data'::regclass::oid") + ep.stop_and_destroy() for sk in env.safekeepers: sk.stop() - # - # Run single-threaded pagebench (TODO: dedup with other benchmark code) - # - env.pageserver.allowed_errors.append( # https://github.com/neondatabase/neon/issues/6925 r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" @@ -396,6 +395,8 @@ def test_latency( ps_http = env.pageserver.http_client() + metrics_before = ps_http.get_metrics() + cmd = [ str(env.neon_binpath / "pagebench"), "get-page-latest-lsn", @@ -405,6 +406,10 @@ def test_latency( env.pageserver.connstr(password=None), "--num-clients", "1", + "--queue-depth", + str(queue_depth), + "--only-relnode", + str(data_table_relnode_oid), "--runtime", "10s", ] @@ -413,12 +418,22 @@ def test_latency( results_path = Path(basepath + ".stdout") log.info(f"Benchmark results at: {results_path}") + metrics_after = ps_http.get_metrics() + with open(results_path) as f: results = json.load(f) log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") total = results["total"] + metric = "request_count" + zenbenchmark.record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + metric = "latency_mean" zenbenchmark.record( metric, @@ -435,3 +450,17 @@ def test_latency( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) + + reads_before = metrics_before.query_one( + "pageserver_io_operations_seconds_count", filter={"operation": "read"} + ) + reads_after = metrics_after.query_one( + "pageserver_io_operations_seconds_count", filter={"operation": "read"} + ) + + zenbenchmark.record( + "virtual_file_reads", + 
metric_value=reads_after.value - reads_before.value, + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) From 2d247375b3b10d80b1f235aa0e12bd41d626d54a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 16 May 2025 14:21:24 +0200 Subject: [PATCH 106/142] Update rust to 1.87.0 (#11938) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. The 1.87.0 release marks 10 years of Rust. [Announcement blog post](https://blog.rust-lang.org/2025/05/15/Rust-1.87.0/) Prior update was in #11431 --- build-tools.Dockerfile | 2 +- pageserver/src/virtual_file/io_engine.rs | 4 +--- proxy/src/binary/pg_sni_router.rs | 1 + proxy/src/binary/proxy.rs | 2 +- rust-toolchain.toml | 2 +- storage_controller/src/scheduler.rs | 6 +----- 6 files changed, 6 insertions(+), 11 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f63d844afd..1933fd19d8 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.86.0 +ENV RUSTC_VERSION=1.87.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7827682498..3cde34eda7 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -124,9 +124,7 @@ pub(super) fn epoll_uring_error_to_std( ) -> std::io::Error { match e { tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + tokio_epoll_uring::Error::System(system) => std::io::Error::other(system), } } diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 2239d064b2..3e87538ae7 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -394,6 +394,7 @@ async fn handle_client( } } +#[allow(clippy::large_enum_variant)] enum Connection { Raw(tokio::net::TcpStream), Tls(tokio_rustls::client::TlsStream), diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index fe0d551f7f..4cb5ddc335 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -862,7 +862,7 @@ async fn configure_redis( ("irsa", _) => match (&args.redis_host, args.redis_port) { (Some(host), Some(port)) => Some( ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), + host.clone(), port, elasticache::CredentialsProvider::new( args.aws_region.clone(), diff --git a/rust-toolchain.toml b/rust-toolchain.toml index a0d5970bd5..c48def3483 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.86.0" +channel = "1.87.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3d5f36fb98..773373391e 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -628,11 +628,7 @@ impl Scheduler { tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node); } - if node.attached_shard_count < expected_attached_shards_per_node { - expected_attached_shards_per_node - node.attached_shard_count - } else { - 0 - } + expected_attached_shards_per_node.saturating_sub(node.attached_shard_count) } pub(crate) fn expected_attached_shard_count(&self) -> usize { From aa22572d8c7602c1e6b26c0afde2df3a4e90f36d Mon Sep 17 00:00:00 2001 From: Evan Fleming Date: Fri, 16 May 2025 05:41:10 -0700 Subject: [PATCH 107/142] safekeeper: refactor static remote storage usage to use Arc (#10179) Greetings! Please add `w=1` to github url when viewing diff (sepcifically `wal_backup.rs`) ## Problem This PR is aimed at addressing the remaining work of #8200. Namely, removing static usage of remote storage in favour of arc. I did not opt to pass `Arc` directly since it is actually `Optional` as it is not necessarily always configured. I wanted to avoid having to pass `Arc>` everywhere with individual consuming functions likely needing to handle unwrapping. Instead I've added a `WalBackup` struct that holds `Optional` and handles initialization/unwrapping RemoteStorage internally. wal_backup functions now take self and `Arc` is passed as a dependency through the various consumers that need it. ## Summary of changes - Add `WalBackup` that holds `Optional` and handles initialization and unwrapping - Modify wal_backup functions to take `WalBackup` as self (Add `w=1` to github url when viewing diff here) - Initialize `WalBackup` in safekeeper root - Store `Arc` in `GlobalTimelineMap` and pass and store in each Timeline as loaded - use `WalBackup` through Timeline as needed ## Refs - task to remove global variables https://github.com/neondatabase/neon/issues/8200 - drive-by fixes https://github.com/neondatabase/neon/issues/11501 by turning the panic reported there into an error `remote storage not configured` --------- Co-authored-by: Christian Schwarz --- safekeeper/src/bin/safekeeper.rs | 9 +- safekeeper/src/copy_timeline.rs | 3 + safekeeper/src/http/routes.rs | 10 ++- safekeeper/src/lib.rs | 6 -- safekeeper/src/pull_timeline.rs | 43 +++++++-- safekeeper/src/test_utils.rs | 6 +- safekeeper/src/timeline.rs | 28 ++++-- safekeeper/src/timeline_eviction.rs | 47 +++++++--- safekeeper/src/timeline_manager.rs | 26 ++++-- safekeeper/src/timelines_global_map.rs | 41 +++++++-- safekeeper/src/wal_backup.rs | 115 ++++++++++++++----------- safekeeper/src/wal_backup_partial.rs | 21 +++-- safekeeper/src/wal_storage.rs | 13 +-- 13 files changed, 255 insertions(+), 113 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index c267a55cb6..8d31ada24f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -22,9 +22,10 @@ use safekeeper::defaults::{ DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; +use safekeeper::wal_backup::WalBackup; use safekeeper::{ BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, - WAL_SERVICE_RUNTIME, broker, control_file, http, 
wal_backup, wal_service, + WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service, }; use sd_notify::NotifyState; use storage_broker::{DEFAULT_ENDPOINT, Uri}; @@ -484,15 +485,15 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { None => None, }; - let global_timelines = Arc::new(GlobalTimelines::new(conf.clone())); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + + let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone())); // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone()); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf).await; - // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 11daff22cb..7984c2e2b9 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use remote_storage::GenericRemoteStorage; use safekeeper_api::membership::Configuration; use tokio::fs::OpenOptions; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; @@ -30,6 +31,7 @@ pub struct Request { pub async fn handle_request( request: Request, global_timelines: Arc, + storage: Arc, ) -> Result<()> { // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( // if LSN will point to the middle of a WAL record, timeline will be in "broken" state @@ -127,6 +129,7 @@ pub async fn handle_request( assert!(first_ondisk_segment >= first_segment); copy_s3_segments( + &storage, wal_seg_size, &request.source_ttid, &request.destination_ttid, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 1a25b07496..384c582678 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -258,6 +258,7 @@ async fn timeline_snapshot_handler(request: Request) -> Result, // so create the chan and write to it in another task. 
@@ -269,6 +270,7 @@ async fn timeline_snapshot_handler(request: Request) -> Result) -> Result bool { - self.remote_storage.is_some() && self.wal_backup_enabled - } -} - impl SafeKeeperConf { pub fn dummy() -> Self { SafeKeeperConf { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index c955e667bd..14aef1ee5e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -9,6 +9,7 @@ use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use http_utils::error::ApiError; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use remote_storage::GenericRemoteStorage; use reqwest::Certificate; use safekeeper_api::Term; use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; @@ -43,6 +44,7 @@ pub async fn stream_snapshot( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) { match tli.try_wal_residence_guard().await { Err(e) => { @@ -53,10 +55,32 @@ pub async fn stream_snapshot( Ok(maybe_resident_tli) => { if let Err(e) = match maybe_resident_tli { Some(resident_tli) => { - stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone()) - .await + stream_snapshot_resident_guts( + resident_tli, + source, + destination, + tx.clone(), + storage, + ) + .await + } + None => { + if let Some(storage) = storage { + stream_snapshot_offloaded_guts( + tli, + source, + destination, + tx.clone(), + &storage, + ) + .await + } else { + tx.send(Err(anyhow!("remote storage not configured"))) + .await + .ok(); + return; + } } - None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await, } { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be @@ -123,10 +147,12 @@ pub(crate) async fn stream_snapshot_offloaded_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: &GenericRemoteStorage, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - tli.snapshot_offloaded(&mut ar, source, destination).await?; + tli.snapshot_offloaded(&mut ar, source, destination, storage) + .await?; ar.finish().await?; @@ -139,10 +165,13 @@ pub async fn stream_snapshot_resident_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - let bctx = tli.start_snapshot(&mut ar, source, destination).await?; + let bctx = tli + .start_snapshot(&mut ar, source, destination, storage) + .await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); let tli_dir = tli.get_timeline_dir(); @@ -182,6 +211,7 @@ impl Timeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: &GenericRemoteStorage, ) -> Result<()> { // Take initial copy of control file, then release state lock let mut control_file = { @@ -216,6 +246,7 @@ impl Timeline { // can fail if the timeline was un-evicted and modified in the background. 
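+        // `storage` is the remote-storage handle passed in by the caller (ultimately
+        // resolved via `WalBackup`); the copy below goes through it rather than a
+        // process-global remote storage client.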
let remote_timeline_path = &self.remote_path; wal_backup::copy_partial_segment( + storage, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) @@ -262,6 +293,7 @@ impl WalResidentTimeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: Option>, ) -> Result { let mut shared_state = self.write_shared_state().await; let wal_seg_size = shared_state.get_wal_seg_size(); @@ -283,6 +315,7 @@ impl WalResidentTimeline { let remote_timeline_path = &self.tli.remote_path; wal_backup::copy_partial_segment( + &*storage.context("remote storage not configured")?, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 618e2b59d2..e2817c8337 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -18,7 +18,7 @@ use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::remote_timeline_path; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. @@ -101,18 +101,22 @@ impl Env { let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + let timeline = Timeline::new( ttid, &timeline_dir, &remote_path, shared_state, conf.clone(), + wal_backup.clone(), ); timeline.bootstrap( &mut timeline.write_shared_state().await, &conf, Arc::new(TimelinesSet::default()), // ignored for now RateLimiter::new(0, 0), + wal_backup, ); Ok(timeline) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index b7ba28f435..588bd4f2c9 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -35,7 +35,8 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::{self, remote_timeline_path}; +use crate::wal_backup; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; @@ -452,6 +453,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + pub(crate) wal_backup: Arc, + remote_deletion: std::sync::Mutex>, /// Hold this gate from code that depends on the Timeline's non-shut-down state. 
While holding @@ -476,6 +479,7 @@ impl Timeline { remote_path: &RemotePath, shared_state: SharedState, conf: Arc, + wal_backup: Arc, ) -> Arc { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state().commit_lsn); @@ -509,6 +513,7 @@ impl Timeline { wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), + wal_backup, }) } @@ -516,6 +521,7 @@ impl Timeline { pub fn load_timeline( conf: Arc, ttid: TenantTimelineId, + wal_backup: Arc, ) -> Result> { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); @@ -529,6 +535,7 @@ impl Timeline { &remote_path, shared_state, conf, + wal_backup, )) } @@ -539,6 +546,7 @@ impl Timeline { conf: &SafeKeeperConf, broker_active_set: Arc, partial_backup_rate_limiter: RateLimiter, + wal_backup: Arc, ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); @@ -561,6 +569,7 @@ impl Timeline { tx, rx, partial_backup_rate_limiter, + wal_backup, ) .await } @@ -606,9 +615,10 @@ impl Timeline { // it is cancelled, so WAL storage won't be opened again. shared_state.sk.close_wal_store(); - if !only_local && self.conf.is_wal_backup_enabled() { + if !only_local { self.remote_delete().await?; } + let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } @@ -675,11 +685,20 @@ impl Timeline { guard: &mut std::sync::MutexGuard>, ) -> RemoteDeletionReceiver { tracing::info!("starting remote deletion"); + let storage = self.wal_backup.get_storage().clone(); let (result_tx, result_rx) = tokio::sync::watch::channel(None); let ttid = self.ttid; tokio::task::spawn( async move { - let r = wal_backup::delete_timeline(&ttid).await; + let r = if let Some(storage) = storage { + wal_backup::delete_timeline(&storage, &ttid).await + } else { + tracing::info!( + "skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage" + ); + Ok(()) + }; + if let Err(e) = &r { // Log error here in case nobody ever listens for our result (e.g. 
dropped API request) tracing::error!("remote deletion failed: {e}"); @@ -1046,14 +1065,13 @@ impl WalResidentTimeline { pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { let (_, persisted_state) = self.get_state().await; - let enable_remote_read = self.conf.is_wal_backup_enabled(); WalReader::new( &self.ttid, self.timeline_dir.clone(), &persisted_state, start_lsn, - enable_remote_read, + self.wal_backup.clone(), ) } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 84c636daf6..e817dbf6f9 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -6,7 +6,7 @@ use anyhow::Context; use camino::Utf8PathBuf; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; @@ -68,6 +68,10 @@ impl Manager { #[instrument(name = "evict_timeline", skip_all)] pub(crate) async fn evict_timeline(&mut self) -> bool { assert!(!self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return false; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -87,7 +91,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to evict timeline: {:?}", e); return false; } @@ -102,6 +106,10 @@ impl Manager { #[instrument(name = "unevict_timeline", skip_all)] pub(crate) async fn unevict_timeline(&mut self) { assert!(self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -121,7 +129,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to unevict timeline: {:?}", e); return; } @@ -137,8 +145,12 @@ impl Manager { /// Ensure that content matches the remote partial backup, if local segment exists. /// Then change state in control file and in-memory. If `delete_offloaded_wal` is set, /// delete the local segment. -async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { - compare_local_segment_with_remote(mgr, partial).await?; +async fn do_eviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { + compare_local_segment_with_remote(mgr, partial, storage).await?; mgr.tli.switch_to_offloaded(partial).await?; // switch manager state as soon as possible @@ -153,12 +165,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho /// Ensure that content matches the remote partial backup, if local segment exists. /// Then download segment to local disk and change state in control file and in-memory. 
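+/// The `storage` argument is the handle the caller obtained from
+/// `WalBackup::get_storage()`; both the validation and the re-download below use
+/// it explicitly.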
-async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { +async fn do_uneviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { // if the local segment is present, validate it - compare_local_segment_with_remote(mgr, partial).await?; + compare_local_segment_with_remote(mgr, partial, storage).await?; // atomically download the partial segment - redownload_partial_segment(mgr, partial).await?; + redownload_partial_segment(mgr, partial, storage).await?; mgr.tli.switch_to_present().await?; // switch manager state as soon as possible @@ -181,6 +197,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) -> async fn redownload_partial_segment( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); let remote_segfile = remote_segment_path(mgr, partial); @@ -190,7 +207,7 @@ async fn redownload_partial_segment( remote_segfile, tmp_file ); - let mut reader = wal_backup::read_object(&remote_segfile, 0).await?; + let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?; let mut file = File::create(&tmp_file).await?; let actual_len = tokio::io::copy(&mut reader, &mut file).await?; @@ -234,13 +251,16 @@ async fn redownload_partial_segment( async fn compare_local_segment_with_remote( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_path = local_segment_path(mgr, partial); match File::open(&local_path).await { - Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial) - .await - .context("validation failed"), + Ok(mut local_file) => { + do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage) + .await + .context("validation failed") + } Err(_) => { info!( "local WAL file {} is not present, skipping validation", @@ -258,6 +278,7 @@ async fn do_validation( file: &mut File, wal_seg_size: usize, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_size = file.metadata().await?.len() as usize; if local_size != wal_seg_size { @@ -270,7 +291,7 @@ async fn do_validation( let remote_segfile = remote_segment_path(mgr, partial); let mut remote_reader: std::pin::Pin> = - wal_backup::read_object(&remote_segfile, 0).await?; + wal_backup::read_object(storage, &remote_segfile, 0).await?; // remote segment should have bytes excatly up to `flush_lsn` let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 71e99a4de7..48eda92fed 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -35,7 +35,7 @@ use crate::state::TimelineState; use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; -use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle}; use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { @@ -200,6 +200,7 @@ pub(crate) struct Manager { pub(crate) conf: SafeKeeperConf, pub(crate) wal_seg_size: usize, pub(crate) walsenders: Arc, + pub(crate) wal_backup: Arc, // current 
state pub(crate) state_version_rx: tokio::sync::watch::Receiver, @@ -238,6 +239,7 @@ pub async fn main_task( manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) { tli.set_status(Status::Started); @@ -256,6 +258,7 @@ pub async fn main_task( broker_active_set, manager_tx, global_rate_limiter, + wal_backup, ) .await; @@ -371,7 +374,7 @@ pub async fn main_task( mgr.tli_broker_active.set(false); // shutdown background tasks - if mgr.conf.is_wal_backup_enabled() { + if let Some(storage) = mgr.wal_backup.get_storage() { if let Some(backup_task) = mgr.backup_task.take() { // If we fell through here, then the timeline is shutting down. This is important // because otherwise joining on the wal_backup handle might hang. @@ -379,7 +382,7 @@ pub async fn main_task( backup_task.join().await; } - wal_backup::update_task(&mut mgr, false, &last_state).await; + wal_backup::update_task(&mut mgr, storage, false, &last_state).await; } if let Some(recovery_task) = &mut mgr.recovery_task { @@ -415,11 +418,13 @@ impl Manager { broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { wal_seg_size: tli.get_wal_seg_size().await, walsenders: tli.get_walsenders().clone(), + wal_backup, state_version_rx: tli.get_state_version_rx(), num_computes_rx: tli.get_walreceivers().get_num_rx(), tli_broker_active: broker_active_set.guard(tli.clone()), @@ -477,8 +482,8 @@ impl Manager { let is_wal_backup_required = wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state); - if self.conf.is_wal_backup_enabled() { - wal_backup::update_task(self, is_wal_backup_required, state).await; + if let Some(storage) = self.wal_backup.get_storage() { + wal_backup::update_task(self, storage, is_wal_backup_required, state).await; } // update the state in Arc @@ -624,9 +629,9 @@ impl Manager { /// Spawns partial WAL backup task if needed. async fn update_partial_backup(&mut self, state: &StateSnapshot) { // check if WAL backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() { + let Some(storage) = self.wal_backup.get_storage() else { return; - } + }; if self.partial_backup_task.is_some() { // partial backup is already running @@ -650,6 +655,7 @@ impl Manager { self.conf.clone(), self.global_rate_limiter.clone(), cancel.clone(), + storage, )); self.partial_backup_task = Some((handle, cancel)); } @@ -669,6 +675,10 @@ impl Manager { /// Reset partial backup state and remove its remote storage data. Since it /// might concurrently uploading something, cancel the task first. async fn backup_partial_reset(&mut self) -> anyhow::Result> { + let Some(storage) = self.wal_backup.get_storage() else { + anyhow::bail!("remote storage is not enabled"); + }; + info!("resetting partial backup state"); // Force unevict timeline if it is evicted before erasing partial backup // state. The intended use of this function is to drop corrupted remote @@ -689,7 +699,7 @@ impl Manager { } let tli = self.wal_resident_timeline()?; - let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; + let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await; // Reset might fail e.g. when cfile is already reset but s3 removal // failed, so set manager state to None beforehand. In any case caller // is expected to retry until success. 
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 41abee369e..af33bcbd20 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -25,6 +25,7 @@ use crate::rate_limit::RateLimiter; use crate::state::TimelinePersistentState; use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir}; use crate::timelines_set::TimelinesSet; +use crate::wal_backup::WalBackup; use crate::wal_storage::Storage; use crate::{SafeKeeperConf, control_file, wal_storage}; @@ -47,15 +48,24 @@ struct GlobalTimelinesState { conf: Arc, broker_active_set: Arc, global_rate_limiter: RateLimiter, + wal_backup: Arc, } impl GlobalTimelinesState { /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (Arc, Arc, RateLimiter) { + fn get_dependencies( + &self, + ) -> ( + Arc, + Arc, + RateLimiter, + Arc, + ) { ( self.conf.clone(), self.broker_active_set.clone(), self.global_rate_limiter.clone(), + self.wal_backup.clone(), ) } @@ -84,7 +94,7 @@ pub struct GlobalTimelines { impl GlobalTimelines { /// Create a new instance of the global timelines map. - pub fn new(conf: Arc) -> Self { + pub fn new(conf: Arc, wal_backup: Arc) -> Self { Self { state: Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), @@ -92,6 +102,7 @@ impl GlobalTimelines { conf, broker_active_set: Arc::new(TimelinesSet::default()), global_rate_limiter: RateLimiter::new(1, 1), + wal_backup, }), } } @@ -147,7 +158,7 @@ impl GlobalTimelines { /// just lock and unlock it for each timeline -- this function is called /// during init when nothing else is running, so this is fine. async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let state = self.state.lock().unwrap(); state.get_dependencies() }; @@ -162,7 +173,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(conf.clone(), ttid) { + match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) { Ok(tli) => { let mut shared_state = tli.write_shared_state().await; self.state @@ -175,6 +186,7 @@ impl GlobalTimelines { &conf, broker_active_set.clone(), partial_backup_rate_limiter.clone(), + wal_backup.clone(), ); } // If we can't load a timeline, it's most likely because of a corrupted @@ -212,6 +224,10 @@ impl GlobalTimelines { self.state.lock().unwrap().broker_active_set.clone() } + pub fn get_wal_backup(&self) -> Arc { + self.state.lock().unwrap().wal_backup.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. pub(crate) async fn create( @@ -222,7 +238,7 @@ impl GlobalTimelines { start_lsn: Lsn, commit_lsn: Lsn, ) -> Result> { - let (conf, _, _) = { + let (conf, _, _, _) = { let state = self.state.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -267,7 +283,7 @@ impl GlobalTimelines { check_tombstone: bool, ) -> Result> { // Check for existence and mark that we're creating it. 
- let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let mut state = self.state.lock().unwrap(); match state.timelines.get(&ttid) { Some(GlobalMapTimeline::CreationInProgress) => { @@ -296,7 +312,14 @@ impl GlobalTimelines { }; // Do the actual move and reflect the result in the map. - match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await { + match GlobalTimelines::install_temp_timeline( + ttid, + tmp_path, + conf.clone(), + wal_backup.clone(), + ) + .await + { Ok(timeline) => { let mut timeline_shared_state = timeline.write_shared_state().await; let mut state = self.state.lock().unwrap(); @@ -314,6 +337,7 @@ impl GlobalTimelines { &conf, broker_active_set, partial_backup_rate_limiter, + wal_backup, ); drop(timeline_shared_state); Ok(timeline) @@ -336,6 +360,7 @@ impl GlobalTimelines { ttid: TenantTimelineId, tmp_path: &Utf8PathBuf, conf: Arc, + wal_backup: Arc, ) -> Result> { let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id); let timeline_path = get_timeline_dir(conf.as_ref(), &ttid); @@ -377,7 +402,7 @@ impl GlobalTimelines { // Do the move. durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; - Timeline::load_timeline(conf, ttid) + Timeline::load_timeline(conf, ttid, wal_backup) } /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 56f4a2faf9..0beb272a60 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -2,6 +2,7 @@ use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; +use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; @@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo; use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{OnceCell, watch}; +use tokio::sync::watch; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required( /// Based on peer information determine which safekeeper should offload; if it /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task /// is running, kill it. -pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) { +pub(crate) async fn update_task( + mgr: &mut Manager, + storage: Arc, + need_backup: bool, + state: &StateSnapshot, +) { let (offloader, election_dbg_str) = determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); let elected_me = Some(mgr.conf.my_id) == offloader; @@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St return; }; - let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx); + let async_task = backup_task_main( + resident, + storage, + mgr.conf.backup_parallel_jobs, + shutdown_rx, + ); let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) @@ -169,33 +180,31 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); - -// Storage must be configured and initialized when this is called. 
-fn get_configured_remote_storage() -> &'static GenericRemoteStorage { - REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap() +pub struct WalBackup { + storage: Option>, } -pub async fn init_remote_storage(conf: &SafeKeeperConf) { - // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide - // dependencies to all tasks instead. - REMOTE_STORAGE - .get_or_init(|| async { - if let Some(conf) = conf.remote_storage.as_ref() { - Some( - GenericRemoteStorage::from_config(conf) - .await - .expect("failed to create remote storage"), - ) - } else { - None +impl WalBackup { + /// Create a new WalBackup instance. + pub async fn new(conf: &SafeKeeperConf) -> Result { + if !conf.wal_backup_enabled { + return Ok(Self { storage: None }); + } + + match conf.remote_storage.as_ref() { + Some(config) => { + let storage = GenericRemoteStorage::from_config(config).await?; + Ok(Self { + storage: Some(Arc::new(storage)), + }) } - }) - .await; + None => Ok(Self { storage: None }), + } + } + + pub fn get_storage(&self) -> Option> { + self.storage.clone() + } } struct WalBackupTask { @@ -204,12 +213,14 @@ struct WalBackupTask { wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, + storage: Arc, } /// Offload single timeline. #[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( tli: WalResidentTimeline, + storage: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { @@ -223,6 +234,7 @@ async fn backup_task_main( timeline_dir: tli.get_timeline_dir(), timeline: tli, parallel_jobs, + storage, }; // task is spinned up only when wal_seg_size already initialized @@ -293,6 +305,7 @@ impl WalBackupTask { match backup_lsn_range( &self.timeline, + self.storage.clone(), &mut backup_lsn, commit_lsn, self.wal_seg_size, @@ -322,6 +335,7 @@ impl WalBackupTask { async fn backup_lsn_range( timeline: &WalResidentTimeline, + storage: Arc, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, @@ -352,7 +366,12 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); + uploads.push_back(backup_single_segment( + &storage, + s, + timeline_dir, + remote_timeline_path, + )); true } None => false, @@ -388,6 +407,7 @@ async fn backup_lsn_range( } async fn backup_single_segment( + storage: &GenericRemoteStorage, seg: &Segment, timeline_dir: &Utf8Path, remote_timeline_path: &RemotePath, @@ -395,7 +415,13 @@ async fn backup_single_segment( let segment_file_path = seg.file_path(timeline_dir)?; let remote_segment_path = seg.remote_path(remote_timeline_path); - let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; + let res = backup_object( + storage, + &segment_file_path, + &remote_segment_path, + seg.size(), + ) + .await; if res.is_ok() { BACKED_UP_SEGMENTS.inc(); } else { @@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { } async fn backup_object( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; @@ -475,12 +500,11 @@ async fn backup_object( } pub(crate) async fn backup_partial_segment( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> 
Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; @@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment( } pub(crate) async fn copy_partial_segment( + storage: &GenericRemoteStorage, source: &RemotePath, destination: &RemotePath, ) -> Result<()> { - let storage = get_configured_remote_storage(); let cancel = CancellationToken::new(); storage.copy_object(source, destination, &cancel).await } pub async fn read_object( + storage: &GenericRemoteStorage, file_path: &RemotePath, offset: u64, ) -> anyhow::Result>> { - let storage = REMOTE_STORAGE - .get() - .context("Failed to get remote storage")? - .as_ref() - .context("No remote storage configured")?; - info!("segment download about to start from remote path {file_path:?} at offset {offset}"); let cancel = CancellationToken::new(); @@ -547,8 +566,10 @@ pub async fn read_object( /// Delete WAL files for the given timeline. Remote storage must be configured /// when called. -pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { - let storage = get_configured_remote_storage(); +pub async fn delete_timeline( + storage: &GenericRemoteStorage, + ttid: &TenantTimelineId, +) -> Result<()> { let remote_path = remote_timeline_path(ttid)?; // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE @@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { } /// Used by wal_backup_partial. -pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { +pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> { let cancel = CancellationToken::new(); // not really used - let storage = get_configured_remote_storage(); storage.delete_objects(paths, &cancel).await } /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( + storage: &GenericRemoteStorage, wal_seg_size: usize, src_ttid: &TenantTimelineId, dst_ttid: &TenantTimelineId, @@ -634,12 +655,6 @@ pub async fn copy_s3_segments( ) -> Result<()> { const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; - let storage = REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap(); - let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 049852a048..fe0f1b3607 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -19,9 +19,11 @@ //! file. Code updates state in the control file before doing any S3 operations. //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
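+//!
+//! The remote storage handle is passed in explicitly; construction looks like
+//! (sketch, assuming the caller already holds an `Arc<GenericRemoteStorage>`):
+//! `PartialBackup::new(tli, conf, storage).await`.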
+use std::sync::Arc; + use camino::Utf8PathBuf; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -154,12 +156,16 @@ pub struct PartialBackup { conf: SafeKeeperConf, local_prefix: Utf8PathBuf, remote_timeline_path: RemotePath, - + storage: Arc, state: State, } impl PartialBackup { - pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + pub async fn new( + tli: WalResidentTimeline, + conf: SafeKeeperConf, + storage: Arc, + ) -> PartialBackup { let (_, persistent_state) = tli.get_state().await; let wal_seg_size = tli.get_wal_seg_size().await; @@ -173,6 +179,7 @@ impl PartialBackup { conf, local_prefix, remote_timeline_path, + storage, } } @@ -240,7 +247,8 @@ impl PartialBackup { let remote_path = prepared.remote_path(&self.remote_timeline_path); // Upload first `backup_bytes` bytes of the segment to the remote storage. - wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes) + .await?; PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); // We uploaded the segment, now let's verify that the data is still actual. @@ -326,7 +334,7 @@ impl PartialBackup { let remote_path = self.remote_timeline_path.join(seg); objects_to_delete.push(remote_path); } - wal_backup::delete_objects(&objects_to_delete).await + wal_backup::delete_objects(&self.storage, &objects_to_delete).await } /// Delete all non-Uploaded segments from the remote storage. There should be only one @@ -424,6 +432,7 @@ pub async fn main_task( conf: SafeKeeperConf, limiter: RateLimiter, cancel: CancellationToken, + storage: Arc, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -432,7 +441,7 @@ pub async fn main_task( let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); - let mut backup = PartialBackup::new(tli, conf).await; + let mut backup = PartialBackup::new(tli, conf, storage).await; debug!("state: {:?}", backup.state); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index f0bac4b40a..8ba3e7cc47 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; use pq_proto::SystemId; use remote_storage::RemotePath; +use std::sync::Arc; use tokio::fs::{self, File, OpenOptions, remove_file}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; @@ -32,7 +33,7 @@ use crate::metrics::{ REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; -use crate::wal_backup::{read_object, remote_timeline_path}; +use crate::wal_backup::{WalBackup, read_object, remote_timeline_path}; pub trait Storage { // Last written LSN. 
@@ -645,7 +646,7 @@ pub struct WalReader { wal_segment: Option>>, // S3 will be used to read WAL if LSN is not available locally - enable_remote_read: bool, + wal_backup: Arc, // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, @@ -664,7 +665,7 @@ impl WalReader { timeline_dir: Utf8PathBuf, state: &TimelinePersistentState, start_pos: Lsn, - enable_remote_read: bool, + wal_backup: Arc, ) -> Result { if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { bail!("state uninitialized, no data to read"); @@ -693,7 +694,7 @@ impl WalReader { wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, wal_segment: None, - enable_remote_read, + wal_backup, local_start_lsn: state.local_start_lsn, timeline_start_lsn: state.timeline_start_lsn, pg_version: state.server.pg_version / 10000, @@ -812,9 +813,9 @@ impl WalReader { } // Try to open remote file, if remote reads are enabled - if self.enable_remote_read { + if let Some(storage) = self.wal_backup.get_storage() { let remote_wal_file_path = self.remote_path.join(&wal_file_name); - return read_object(&remote_wal_file_path, xlogoff as u64).await; + return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") From baafcc5d4108b1be38edf428c3f3dd87cc0c9508 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 16 May 2025 14:12:39 +0000 Subject: [PATCH 108/142] proxy: Fix misspelled flag value alias, swap names and aliases (#11949) ## Problem There's a misspelled flag value alias that's not really used anywhere. ## Summary of changes Fix the alias and make aliases the official flag values and keep old values as aliases. Also rename enum variant. No need for it to carry the version now. --- proxy/src/binary/proxy.rs | 9 +++++---- proxy/src/context/mod.rs | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 4cb5ddc335..51713902bc 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -43,11 +43,12 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; #[derive(Clone, Debug, ValueEnum)] +#[clap(rename_all = "kebab-case")] enum AuthBackendType { - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, + #[clap(alias("cplane-v1"))] + ControlPlane, - #[value(name("link"), alias("control-redirect"))] + #[clap(alias("link"))] ConsoleRedirect, #[cfg(any(test, feature = "testing"))] @@ -707,7 +708,7 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 5f649d2b21..79aaf22990 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -78,7 +78,7 @@ struct RequestContextInner { #[derive(Clone, Debug)] pub(crate) enum AuthMethod { - // aka passwordless, fka link + // aka link ConsoleRedirect, ScramSha256, ScramSha256Plus, From 55f91cf10b30c3c648ac1301b95cd049bd7f0e21 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 16 May 2025 17:45:08 +0300 Subject: [PATCH 109/142] Update 'nix' package (#11948) There were some incompatible changes. 
Most churn was from switching from the now-deprecated fcntl:flock() function to fcntl::Flock::lock(). The new function returns a guard object, while with the old function, the lock was associated directly with the file descriptor. It's good to stay up-to-date in general, but the impetus to do this now is that in https://github.com/neondatabase/neon/pull/11929, I want to use some functions that were added only in the latest version of 'nix', and it's nice to not have to build multiple versions. (Although, different versions of 'nix' are still pulled in as indirect dependencies from other packages) --- Cargo.lock | 25 +++++--- Cargo.toml | 2 +- control_plane/src/background_process.rs | 4 +- control_plane/src/bin/neon_local.rs | 13 ++-- libs/utils/src/crashsafe.rs | 6 +- libs/utils/src/fs_ext/rename_noreplace.rs | 4 +- libs/utils/src/lock_file.rs | 63 ++++++++++--------- pageserver/src/tenant/secondary/downloader.rs | 4 +- pageserver/src/virtual_file.rs | 2 +- 9 files changed, 66 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f075b45e49..1edd20105d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1112,6 +1112,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "cgroups-rs" version = "0.3.3" @@ -1306,7 +1312,7 @@ dependencies = [ "itertools 0.10.5", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "notify", "num_cpus", "once_cell", @@ -1429,7 +1435,7 @@ dependencies = [ "humantime-serde", "hyper 0.14.30", "jsonwebtoken", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pageserver_api", "pageserver_client", @@ -3512,9 +3518,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libloading" @@ -3821,12 +3827,13 @@ dependencies = [ [[package]] name = "nix" -version = "0.27.1" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ "bitflags 2.8.0", "cfg-if", + "cfg_aliases", "libc", "memoffset 0.9.0", ] @@ -4280,7 +4287,7 @@ dependencies = [ "jsonwebtoken", "md5", "metrics", - "nix 0.27.1", + "nix 0.30.1", "num-traits", "num_cpus", "once_cell", @@ -4356,7 +4363,7 @@ dependencies = [ "humantime", "humantime-serde", "itertools 0.10.5", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "postgres_backend", "postgres_ffi", @@ -7899,7 +7906,7 @@ dependencies = [ "humantime", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pem", "pin-project-lite", diff --git a/Cargo.toml b/Cargo.toml index 6b87ce549d..d6fffe7768 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,7 +127,7 @@ md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" -nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.30.1", features = ["dir", "fs", "process", 
"socket", "signal", "poll"] } # Do not update to >= 7.0.0, at least. The update will have a significant impact # on compute startup metrics (start_postgres_ms), >= 25% degradation. notify = "6.0.0" diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 1eac4f7ff0..4f0934e411 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -14,7 +14,7 @@ use std::ffi::OsStr; use std::io::Write; -use std::os::unix::prelude::AsRawFd; +use std::os::fd::AsFd; use std::os::unix::process::CommandExt; use std::path::Path; use std::process::Command; @@ -356,7 +356,7 @@ where let file = pid_file::claim_for_current_process(&path).expect("claim pid file"); // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile // remains locked after exec. - nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty())) + nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty())) .expect("remove FD_CLOEXEC"); // Don't run drop(file), it would close the file before we actually exec. std::mem::forget(file); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 191a22f1de..98ab6e5657 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,7 +8,6 @@ use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; use std::fs::File; -use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; -use nix::fcntl::{FlockArg, flock}; +use nix::fcntl::{Flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -749,16 +748,16 @@ struct TimelineTreeEl { /// A flock-based guard over the neon_local repository directory struct RepoLock { - _file: File, + _file: Flock, } impl RepoLock { fn new() -> Result { let repo_dir = File::open(local_env::base_path())?; - let repo_dir_fd = repo_dir.as_raw_fd(); - flock(repo_dir_fd, FlockArg::LockExclusive)?; - - Ok(Self { _file: repo_dir }) + match Flock::lock(repo_dir, FlockArg::LockExclusive) { + Ok(f) => Ok(Self { _file: f }), + Err((_, e)) => Err(e).context("flock error"), + } } } diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 215fa36df4..45acaf682f 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::fs::{self, File}; use std::io::{self, Write}; -use std::os::fd::AsRawFd; +use std::os::fd::AsFd; use camino::{Utf8Path, Utf8PathBuf}; @@ -210,13 +210,13 @@ pub fn overwrite( /// Syncs the filesystem for the given file descriptor. #[cfg_attr(target_os = "macos", allow(unused_variables))] -pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { +pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> { // Linux guarantees durability for syncfs. // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). 
#[cfg(target_os = "linux")] { use anyhow::Context; - nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; + nix::unistd::syncfs(fd).context("syncfs")?; } #[cfg(target_os = "macos")] { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index d0c07353d0..c945ecadf0 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -11,9 +11,9 @@ pub fn rename_noreplace( #[cfg(all(target_os = "linux", target_env = "gnu"))] { nix::fcntl::renameat2( - None, + nix::fcntl::AT_FDCWD, src, - None, + nix::fcntl::AT_FDCWD, dst, nix::fcntl::RenameFlags::RENAME_NOREPLACE, ) diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 6aeeeca021..b3c8d74d7d 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -1,6 +1,6 @@ //! A module to create and read lock files. //! -//! File locking is done using [`fcntl::flock`] exclusive locks. +//! File locking is done using [`nix::fcntl::Flock`] exclusive locks. //! The only consumer of this module is currently //! [`pid_file`](crate::pid_file). See the module-level comment //! there for potential pitfalls with lock files that are used @@ -9,26 +9,25 @@ use std::fs; use std::io::{Read, Write}; use std::ops::Deref; -use std::os::unix::prelude::AsRawFd; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use nix::errno::Errno::EAGAIN; -use nix::fcntl; +use nix::fcntl::{Flock, FlockArg}; use crate::crashsafe; -/// A handle to an open and unlocked, but not-yet-written lock file. +/// A handle to an open and flocked, but not-yet-written lock file. /// Returned by [`create_exclusive`]. #[must_use] pub struct UnwrittenLockFile { path: Utf8PathBuf, - file: fs::File, + file: Flock, } /// Returned by [`UnwrittenLockFile::write_content`]. #[must_use] -pub struct LockFileGuard(fs::File); +pub struct LockFileGuard(Flock); impl Deref for LockFileGuard { type Target = fs::File; @@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result Ok(UnwrittenLockFile { + Ok(lock_file) => Ok(UnwrittenLockFile { path: lock_file_path.to_owned(), file: lock_file, }), - Err(EAGAIN) => anyhow::bail!("file is already locked"), - Err(e) => Err(e).context("flock error"), + Err((_, EAGAIN)) => anyhow::bail!("file is already locked"), + Err((_, e)) => Err(e).context("flock error"), } } @@ -105,32 +101,37 @@ pub enum LockFileRead { /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); - let mut lock_file = match res { + let lock_file = match res { Ok(f) => f, Err(e) => match e.kind() { std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist), _ => return Err(e).context("open lock file"), }, }; - let res = fcntl::flock( - lock_file.as_raw_fd(), - fcntl::FlockArg::LockExclusiveNonblock, - ); + let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock); // We need the content regardless of lock success / failure. // But, read it after flock so that, if it succeeded, the content is consistent. 
- let mut content = String::new(); - lock_file - .read_to_string(&mut content) - .context("read lock file")?; match res { - Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess( - LockFileGuard(lock_file), - content, - )), - Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess { - not_locked_file: lock_file, - content, - }), - Err(e) => Err(e).context("flock error"), + Ok(mut locked_file) => { + let mut content = String::new(); + locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::NotHeldByAnyProcess( + LockFileGuard(locked_file), + content, + )) + } + Err((mut not_locked_file, EAGAIN)) => { + let mut content = String::new(); + not_locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::LockedByOtherProcess { + not_locked_file, + content, + }) + } + Err((_, e)) => Err(e).context("flock error"), } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c26b7626ef..dd49c843f3 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -668,7 +668,9 @@ impl From for UpdateError { impl From for UpdateError { fn from(value: std::io::Error) -> Self { - if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + if let Some(nix::errno::Errno::ENOSPC) = + value.raw_os_error().map(nix::errno::Errno::from_raw) + { UpdateError::NoSpace } else if value .get_ref() diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c707d35114..45b6e44c54 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -408,7 +408,7 @@ impl OpenFiles { /// error types may be elegible for retry. pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { use nix::errno::Errno::*; - match e.raw_os_error().map(nix::errno::from_i32) { + match e.raw_os_error().map(nix::errno::Errno::from_raw) { Some(EIO) => { // Terminate on EIO because we no longer trust the device to store // data safely, or to uphold persistence guarantees on fsync. From 532d9b646e4eaab6e0d94da8a6f890a9c834647c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 17 May 2025 00:22:36 +0300 Subject: [PATCH 110/142] Add simple facility for an extendable shared memory area (#11929) You still need to provide a max size up-front, but memory is only allocated for the portion that is in use. The module is currently unused, but will be used by the new compute communicator project, in the neon Postgres extension. 
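(Editor's sketch, not part of the original commit message.) A rough single-process usage example of the new facility, assuming the crate is imported as `neon_shmem`; the sizes are arbitrary:

```rust
use neon_shmem::ShmemHandle;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Reserve 1 GiB of address space up-front, but back only 64 KiB with memory.
    let shmem = ShmemHandle::new("example-area", 64 * 1024, 1024 * 1024 * 1024)?;
    assert_eq!(shmem.current_size(), 64 * 1024);

    // Grow the backing file later; the mapping itself never moves.
    shmem.set_size(1024 * 1024)?;

    // Access the user data region; the caller must stay within current_size().
    unsafe { std::ptr::write_bytes(shmem.data_ptr.as_ptr(), 0xAB, 4096) };
    Ok(())
}
```

For cross-process use, the handle is created before fork() and inherited by the child processes, as the module's own tests demonstrate.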
See https://github.com/neondatabase/neon/issues/11729 --------- Co-authored-by: Erik Grinaker --- Cargo.lock | 11 + Cargo.toml | 3 +- libs/neon-shmem/Cargo.toml | 13 ++ libs/neon-shmem/src/lib.rs | 418 +++++++++++++++++++++++++++++++++++++ workspace_hack/Cargo.toml | 3 +- 5 files changed, 446 insertions(+), 2 deletions(-) create mode 100644 libs/neon-shmem/Cargo.toml create mode 100644 libs/neon-shmem/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 1edd20105d..8ca65b58ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3794,6 +3794,16 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "neon-shmem" +version = "0.1.0" +dependencies = [ + "nix 0.30.1", + "tempfile", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "never-say-never" version = "6.6.666" @@ -8482,6 +8492,7 @@ dependencies = [ "log", "memchr", "nix 0.26.4", + "nix 0.30.1", "nom", "num", "num-bigint", diff --git a/Cargo.toml b/Cargo.toml index d6fffe7768..74b281f88f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "libs/postgres_ffi", "libs/safekeeper_api", "libs/desim", + "libs/neon-shmem", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", @@ -127,7 +128,7 @@ md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" -nix = { version = "0.30.1", features = ["dir", "fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] } # Do not update to >= 7.0.0, at least. The update will have a significant impact # on compute startup metrics (start_postgres_ms), >= 25% degradation. notify = "6.0.0" diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml new file mode 100644 index 0000000000..2a636bec40 --- /dev/null +++ b/libs/neon-shmem/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "neon-shmem" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +thiserror.workspace = true +nix.workspace=true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[target.'cfg(target_os = "macos")'.dependencies] +tempfile = "3.14.0" diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs new file mode 100644 index 0000000000..e1b14b1371 --- /dev/null +++ b/libs/neon-shmem/src/lib.rs @@ -0,0 +1,418 @@ +//! Shared memory utilities for neon communicator + +use std::num::NonZeroUsize; +use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use nix::errno::Errno; +use nix::sys::mman::MapFlags; +use nix::sys::mman::ProtFlags; +use nix::sys::mman::mmap as nix_mmap; +use nix::sys::mman::munmap as nix_munmap; +use nix::unistd::ftruncate as nix_ftruncate; + +/// ShmemHandle represents a shared memory area that can be shared by processes over fork(). +/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's +/// specified at creation. +/// +/// The area is backed by an anonymous file created with memfd_create(). The full address space for +/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`], +/// the underlying file is resized. Do not access the area beyond the current size. Currently, that +/// will cause the file to be expanded, but we might use mprotect() etc. 
to enforce that in the +/// future. +pub struct ShmemHandle { + /// memfd file descriptor + fd: OwnedFd, + + max_size: usize, + + // Pointer to the beginning of the shared memory area. The header is stored there. + shared_ptr: NonNull, + + // Pointer to the beginning of the user data + pub data_ptr: NonNull, +} + +/// This is stored at the beginning in the shared memory area. +struct SharedStruct { + max_size: usize, + + /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag + current_size: AtomicUsize, +} + +const RESIZE_IN_PROGRESS: usize = 1 << 63; + +const HEADER_SIZE: usize = std::mem::size_of::(); + +/// Error type returned by the ShmemHandle functions. +#[derive(thiserror::Error, Debug)] +#[error("{msg}: {errno}")] +pub struct Error { + pub msg: String, + pub errno: Errno, +} + +impl Error { + fn new(msg: &str, errno: Errno) -> Error { + Error { + msg: msg.to_string(), + errno, + } + } +} + +impl ShmemHandle { + /// Create a new shared memory area. To communicate between processes, the processes need to be + /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes. + /// + /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other + /// processes can continue using it, however. + pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result { + // create the backing anonymous file. + let fd = create_backing_file(name)?; + + Self::new_with_fd(fd, initial_size, max_size) + } + + fn new_with_fd( + fd: OwnedFd, + initial_size: usize, + max_size: usize, + ) -> Result { + // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size + // is a little larger than this because of the SharedStruct header. Make the upper limit + // somewhat smaller than that, because with anything close to that, you'll run out of + // memory anyway. + if max_size >= 1 << 48 { + panic!("max size {} too large", max_size); + } + if initial_size > max_size { + panic!("initial size {initial_size} larger than max size {max_size}"); + } + + // The actual initial / max size is the one given by the caller, plus the size of + // 'SharedStruct'. + let initial_size = HEADER_SIZE + initial_size; + let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap(); + + // Reserve address space for it with mmap + // + // TODO: Use MAP_HUGETLB if possible + let start_ptr = unsafe { + nix_mmap( + None, + max_size, + ProtFlags::PROT_READ | ProtFlags::PROT_WRITE, + MapFlags::MAP_SHARED, + &fd, + 0, + ) + } + .map_err(|e| Error::new("mmap failed: {e}", e))?; + + // Reserve space for the initial size + enlarge_file(fd.as_fd(), initial_size as u64)?; + + // Initialize the header + let shared: NonNull = start_ptr.cast(); + unsafe { + shared.write(SharedStruct { + max_size: max_size.into(), + current_size: AtomicUsize::new(initial_size), + }) + }; + + // The user data begins after the header + let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) }; + + Ok(ShmemHandle { + fd, + max_size: max_size.into(), + shared_ptr: shared, + data_ptr, + }) + } + + // return reference to the header + fn shared(&self) -> &SharedStruct { + unsafe { self.shared_ptr.as_ref() } + } + + /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified + /// when creating the area. + /// + /// This may only be called from one process/thread concurrently. We detect that case + /// and return an Error. 
+ pub fn set_size(&self, new_size: usize) -> Result<(), Error> { + let new_size = new_size + HEADER_SIZE; + let shared = self.shared(); + + if new_size > self.max_size { + panic!( + "new size ({} is greater than max size ({})", + new_size, self.max_size + ); + } + assert_eq!(self.max_size, shared.max_size); + + // Lock the area by setting the bit in 'current_size' + // + // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory + // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But + // since this is not performance-critical, better safe than sorry . + let mut old_size = shared.current_size.load(Ordering::Acquire); + loop { + if (old_size & RESIZE_IN_PROGRESS) != 0 { + return Err(Error::new( + "concurrent resize detected", + Errno::UnknownErrno, + )); + } + match shared.current_size.compare_exchange( + old_size, + new_size, + Ordering::Acquire, + Ordering::Relaxed, + ) { + Ok(_) => break, + Err(x) => old_size = x, + } + } + + // Ok, we got the lock. + // + // NB: If anything goes wrong, we *must* clear the bit! + let result = { + use std::cmp::Ordering::{Equal, Greater, Less}; + match new_size.cmp(&old_size) { + Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| { + Error::new("could not shrink shmem segment, ftruncate failed: {e}", e) + }), + Equal => Ok(()), + Greater => enlarge_file(self.fd.as_fd(), new_size as u64), + } + }; + + // Unlock + shared.current_size.store( + if result.is_ok() { new_size } else { old_size }, + Ordering::Release, + ); + + result + } + + /// Returns the current user-visible size of the shared memory segment. + /// + /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's + /// responsibility not to access the area beyond the current size. + pub fn current_size(&self) -> usize { + let total_current_size = + self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS; + total_current_size - HEADER_SIZE + } +} + +impl Drop for ShmemHandle { + fn drop(&mut self) { + // SAFETY: The pointer was obtained from mmap() with the given size. + // We unmap the entire region. + let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) }; + // The fd is dropped automatically by OwnedFd. + } +} + +/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an +/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for +/// development and testing, but in production we want the file to stay in memory. +/// +/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused. +#[allow(unused_variables)] +fn create_backing_file(name: &str) -> Result { + #[cfg(not(target_os = "macos"))] + { + nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty()) + .map_err(|e| Error::new("memfd_create failed: {e}", e)) + } + #[cfg(target_os = "macos")] + { + let file = tempfile::tempfile().map_err(|e| { + Error::new( + "could not create temporary file to back shmem area: {e}", + nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)), + ) + })?; + Ok(OwnedFd::from(file)) + } +} + +fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> { + // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that + // we don't get a segfault later when trying to actually use it. 
+ #[cfg(not(target_os = "macos"))] + { + nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| { + Error::new( + "could not grow shmem segment, posix_fallocate failed: {e}", + e, + ) + }) + } + // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate' + #[cfg(target_os = "macos")] + { + nix::unistd::ftruncate(fd, size as i64) + .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use nix::unistd::ForkResult; + use std::ops::Range; + + /// check that all bytes in given range have the expected value. + fn assert_range(ptr: *const u8, expected: u8, range: Range) { + for i in range { + let b = unsafe { *(ptr.add(i)) }; + assert_eq!(expected, b, "unexpected byte at offset {}", i); + } + } + + /// Write 'b' to all bytes in the given range + fn write_range(ptr: *mut u8, b: u8, range: Range) { + unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) }; + } + + // simple single-process test of growing and shrinking + #[test] + fn test_shmem_resize() -> Result<(), Error> { + let max_size = 1024 * 1024; + let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?; + + assert_eq!(init_struct.current_size(), 0); + + // Initial grow + let size1 = 10000; + init_struct.set_size(size1).unwrap(); + assert_eq!(init_struct.current_size(), size1); + + // Write some data + let data_ptr = init_struct.data_ptr.as_ptr(); + write_range(data_ptr, 0xAA, 0..size1); + assert_range(data_ptr, 0xAA, 0..size1); + + // Shrink + let size2 = 5000; + init_struct.set_size(size2).unwrap(); + assert_eq!(init_struct.current_size(), size2); + + // Grow again + let size3 = 20000; + init_struct.set_size(size3).unwrap(); + assert_eq!(init_struct.current_size(), size3); + + // Try to read it. The area that was shrunk and grown again should read as all zeros now + assert_range(data_ptr, 0xAA, 0..5000); + assert_range(data_ptr, 0, 5000..size1); + + // Try to grow beyond max_size + //let size4 = max_size + 1; + //assert!(init_struct.set_size(size4).is_err()); + + // Dropping init_struct should unmap the memory + drop(init_struct); + + Ok(()) + } + + /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier, + /// but is stored in the shared memory area and works across processes. It's implemented by + /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes. + struct SimpleBarrier { + num_procs: usize, + count: AtomicUsize, + } + + impl SimpleBarrier { + unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) { + unsafe { + *ptr = SimpleBarrier { + num_procs, + count: AtomicUsize::new(0), + } + } + } + + pub fn wait(&self) { + let old = self.count.fetch_add(1, Ordering::Relaxed); + + let generation = old / self.num_procs; + + let mut current = old + 1; + while current < (generation + 1) * self.num_procs { + std::thread::sleep(std::time::Duration::from_millis(10)); + current = self.count.load(Ordering::Relaxed); + } + } + } + + #[test] + fn test_multi_process() { + // Initialize + let max_size = 1_000_000_000_000; + let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap(); + let ptr = init_struct.data_ptr.as_ptr(); + + // Store the SimpleBarrier in the first 1k of the area. 
+ init_struct.set_size(10000).unwrap(); + let barrier_ptr: *mut SimpleBarrier = unsafe { + ptr.add(ptr.align_offset(std::mem::align_of::())) + .cast() + }; + unsafe { SimpleBarrier::init(barrier_ptr, 2) }; + let barrier = unsafe { barrier_ptr.as_ref().unwrap() }; + + // Fork another test process. The code after this runs in both processes concurrently. + let fork_result = unsafe { nix::unistd::fork().unwrap() }; + + // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000 + if fork_result.is_parent() { + write_range(ptr, 0xAA, 1000..2000); + } else { + write_range(ptr, 0xBB, 2000..3000); + } + barrier.wait(); + // Verify the contents. (in both processes) + assert_range(ptr, 0xAA, 1000..2000); + assert_range(ptr, 0xBB, 2000..3000); + + // Grow, from the child this time + let size = 10_000_000; + if !fork_result.is_parent() { + init_struct.set_size(size).unwrap(); + } + barrier.wait(); + + // make some writes at the end + if fork_result.is_parent() { + write_range(ptr, 0xAA, (size - 10)..size); + } else { + write_range(ptr, 0xBB, (size - 20)..(size - 10)); + } + barrier.wait(); + + // Verify the contents. (This runs in both processes) + assert_range(ptr, 0, (size - 1000)..(size - 20)); + assert_range(ptr, 0xBB, (size - 20)..(size - 10)); + assert_range(ptr, 0xAA, (size - 10)..size); + + if let ForkResult::Parent { child } = fork_result { + nix::sys::wait::waitpid(child, None).unwrap(); + } + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index fecf62f756..69d44b82ea 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,8 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } -nix = { version = "0.26" } +nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" } +nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] } nom = { version = "7" } num = { version = "0.4" } num-bigint = { version = "0.4" } From deed46015dd5eaa2dcc48f5f17f3e923a13e6711 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 17 May 2025 08:34:54 +0200 Subject: [PATCH 111/142] CI(test-images): increase timeout from 20m to 60m (#11955) ## Problem For some reason (unknown yet) 20m timeout is not enough for `test-images` job on arm runners. Ref: https://github.com/neondatabase/neon/actions/runs/15075321681/job/42387530399?pr=11953 ## Summary of changes - Increase the timeout from 20m to 1h --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6b19f6ef01..a887db2ab1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -963,7 +963,7 @@ jobs: fi - name: Verify docker-compose example and test extensions - timeout-minutes: 20 + timeout-minutes: 60 env: TAG: >- ${{ From 8e05639dbf6def383da7b138e28cf930ac506647 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 17 May 2025 22:06:59 +0300 Subject: [PATCH 112/142] Invalidate LFC after unlogged build (#11951) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1747391617951239 LFC is not always properly updated during unlogged build so it can contain stale content. 
## Summary of changes Invalidate LFC content at the end of unlogged build Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 38 ++++++++++++++++++++++++++++++++++++++ pgxn/neon/file_cache.h | 1 + pgxn/neon/pagestore_smgr.c | 19 ++----------------- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ecc55bb540..176fd9643f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -936,6 +936,44 @@ lfc_prewarm_main(Datum main_arg) lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); } +void +lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 hash; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + if (LFC_ENABLED()) + { + for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk) + { + tag.blockNum = blkno; + hash = get_hash_value(lfc_hash, &tag); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + if (entry != NULL) + { + for (int i = 0; i < lfc_blocks_per_chunk; i++) + { + if (GET_STATE(entry, i) == AVAILABLE) + { + lfc_ctl->used_pages -= 1; + SET_STATE(entry, i, UNAVAILABLE); + } + } + } + } + } + LWLockRelease(lfc_lock); +} /* * Check if page is present in the cache. diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index c7b6b09f72..d5ac55d5ba 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -28,6 +28,7 @@ typedef struct FileCacheState extern bool lfc_store_prefetch_result; /* functions for local file cache */ +extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks); extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *const *buffers, BlockNumber nblocks); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 31e47db7d7..5558a903e2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -919,9 +919,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdextend(reln, forkNum, blkno, buffer, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); return; default: @@ -1010,14 +1007,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - { - for (int i = 0; i < nblocks; i++) - { - lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); - } - } return; default: @@ -1617,9 +1606,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); return; default: neon_log(ERROR, "unknown relpersistence '%c'", 
reln->smgr_relpersistence); @@ -1685,9 +1671,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -2083,6 +2066,8 @@ neon_end_unlogged_build(SMgrRelation reln) forknum); forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); + lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks); + mdclose(reln, forknum); #ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ From 81c6a5a796d1a4278b320d241c2dcab95982a7c6 Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Sun, 18 May 2025 00:12:01 +0300 Subject: [PATCH 113/142] Migrate to correct logger interface (#11956) ## Problem Currently the `logger` library throws annoying deprecation warnings: ```python DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead ``` ## Summary of changes This small PR resolves the annoying deprecation warnings by migrating to `.warning` as suggested. Signed-off-by: Emmanuel Ferdman --- test_runner/fixtures/neon_cli.py | 2 +- test_runner/regress/test_pageserver_secondary.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 4eaa4b7d99..bb07e2b6d1 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -103,7 +103,7 @@ class AbstractNeonCli: else: stdout = "" - log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}") + log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}") raise indent = " " diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 3aa0c63979..f2523ec9b5 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -510,7 +510,7 @@ def list_elegible_layers( except KeyError: # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map # matches what's on disk. 
- log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}") raise return list(c for c in candidates if is_visible(c)) @@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) - log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}") + log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}") raise # Scrub the remote storage From 4f0a9fc5698dfcc1a59ce6d32ca2b1e8ebb5de77 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 18 May 2025 00:06:32 +0200 Subject: [PATCH 114/142] chore(deps): bump flask-cors from 5.0.0 to 6.0.0 in the pip group across 1 directory (#11960) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1a772d3415..e6440761be 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "5.0.0" -description = "A Flask extension adding a decorator for CORS support" +version = "6.0.0" +description = "A Flask extension simplifying CORS support" optional = false -python-versions = "*" +python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, - {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, + {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"}, + {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"}, ] [package.dependencies] -Flask = ">=0.9" +flask = ">=0.9" +Werkzeug = ">=0.7" [[package]] name = "frozenlist" From e9631296784799269f079af6a3c5b2fe65e3c057 Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Sat, 17 May 2025 15:30:29 -0700 Subject: [PATCH 115/142] pagesteam_handle_batched_message -> pagestream_handle_batched_message (#11916) ## Problem Found a typo in code. 
## Summary of changes Co-authored-by: Trung Dinh Co-authored-by: Erik Grinaker --- pageserver/src/page_service.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bca1cb5b49..101e312ec3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1278,7 +1278,7 @@ impl PageServerHandler { } #[instrument(level = tracing::Level::DEBUG, skip_all)] - async fn pagesteam_handle_batched_message( + async fn pagestream_handle_batched_message( &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, @@ -1733,7 +1733,7 @@ impl PageServerHandler { }; let result = self - .pagesteam_handle_batched_message( + .pagestream_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), @@ -1909,7 +1909,7 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( + self.pagestream_handle_batched_message( pgb_writer, batch, io_concurrency.clone(), From 81c557d87e2381d653deb0b0b9decbbdfc76f30f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 18 May 2025 08:02:47 +0300 Subject: [PATCH 116/142] Unlogged build get smgr (#11954) ## Problem See https://github.com/neondatabase/neon/issues/11910 and https://neondb.slack.com/archives/C04DGM6SMTM/p1747314649059129 ## Summary of changes Do not change persistence in `start_unlogged_build` Postgres PRs: https://github.com/neondatabase/postgres/pull/642 https://github.com/neondatabase/postgres/pull/641 https://github.com/neondatabase/postgres/pull/640 https://github.com/neondatabase/postgres/pull/639 --------- Co-authored-by: Konstantin Knizhnik --- compute/patches/rum.patch | 6 +-- pgxn/neon/neon_pgversioncompat.h | 8 +++- pgxn/neon/pagestore_smgr.c | 78 ++++++++++++++++++++++++-------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++-- 8 files changed, 76 insertions(+), 32 deletions(-) diff --git a/compute/patches/rum.patch b/compute/patches/rum.patch index b45afe2874..aed1badc13 100644 --- a/compute/patches/rum.patch +++ b/compute/patches/rum.patch @@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644 RelationGetRelationName(index)); +#ifdef NEON_SMGR -+ smgr_start_unlogged_build(index->rd_smgr); ++ smgr_start_unlogged_build(RelationGetSmgr(index)); +#endif + initRumState(&buildstate.rumstate, index); @@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644 rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); +#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + /* @@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644 } +#ifdef NEON_SMGR -+ smgr_end_unlogged_build(index->rd_smgr); ++ smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif + /* diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index b3ed0c04e8..bf91a02b45 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define InvalidRelFileNumber InvalidOid -#define SMgrRelGetRelInfo(reln) \ +#define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers @@ -148,6 +148,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#define NRelFileInfoInvalidate(rinfo) do { \ + NInfoGetSpcOid(rinfo) = 
InvalidOid; \ + NInfoGetDbOid(rinfo) = InvalidOid; \ + NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \ + } while (0) + #if PG_MAJORVERSION_NUM < 17 #define ProcNumber BackendId #define INVALID_PROC_NUMBER InvalidBackendId diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 5558a903e2..43fd715bbb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -108,7 +108,7 @@ typedef enum UNLOGGED_BUILD_NOT_PERMANENT } UnloggedBuildPhase; -static SMgrRelation unlogged_build_rel = NULL; +static NRelFileInfo unlogged_build_rel_info; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); @@ -912,8 +912,14 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1000,8 +1006,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1376,8 +1388,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdread(reln, forkNum, blkno, buffer); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1463,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1597,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + return; + } break; case RELPERSISTENCE_TEMP: @@ -1666,6 +1699,11 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1706,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + return mdnblocks(reln, forknum); + } break; case RELPERSISTENCE_TEMP: @@ -1775,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, 
InfoFromSMgrRel(reln))) + { + mdtruncate(reln, forknum, old_blocks, nblocks); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1913,7 +1960,6 @@ neon_start_unlogged_build(SMgrRelation reln) */ if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) neon_log(ERROR, "unlogged relation build is already in progress"); - Assert(unlogged_build_rel == NULL); ereport(SmgrTrace, (errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u", @@ -1930,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; #ifdef DEBUG_COMPARE_LOCAL if (!IsParallelWorker()) @@ -1951,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); #endif - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; - /* Make the relation look like it's unlogged */ - reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; - /* * Create the local file. In a parallel build, the leader is expected to * call this first and do it. @@ -1983,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln) static void neon_finish_unlogged_build_phase_1(SMgrRelation reln) { - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromSMgrRel(reln))))); + RelFileInfoFmt((unlogged_build_rel_info))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) return; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * In a parallel build, (only) the leader process performs the 2nd @@ -2001,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) */ if (IsParallelWorker()) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } else @@ -2022,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln) { NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln); - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromNInfoB(rinfob))))); + RelFileInfoFmt(unlogged_build_rel_info)))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { @@ -2034,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln) BlockNumber nblocks; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * Update the last-written LSN cache. 
@@ -2055,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln) InfoFromNInfoB(rinfob), MAIN_FORKNUM); - /* Make the relation look permanent again */ - reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; - /* Remove local copy */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { @@ -2078,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln) mdunlink(rinfob, INIT_FORKNUM, true); #endif } - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } @@ -2151,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg) * Forget about any build we might have had in progress. The local * file will be unlinked by smgrDoPendingDeletes() */ - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; break; @@ -2163,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg) case XACT_EVENT_PRE_PREPARE: if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 4cca6f8083..55c0d45abe 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 4cca6f8083483dda9e12eae292cf788d45bd561f +Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index daa81cffcf..de7640f55d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit daa81cffcf063c54b29a9aabdb6604625f675ad0 +Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 15710a76b7..0bf96bd6d7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc +Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index e5374b7299..8be779fd3a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit e5374b72997b0afc8374137674e873f7a558120a +Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44 diff --git a/vendor/revisions.json b/vendor/revisions.json index 0fc2d3996d..3e999760f4 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "e5374b72997b0afc8374137674e873f7a558120a" + "8be779fd3ab9e87206da96a7e4842ef1abf04f44" ], "v16": [ "16.9", - "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc" + "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198" ], "v15": [ "15.13", - "daa81cffcf063c54b29a9aabdb6604625f675ad0" + "de7640f55da07512834d5cc40c4b3fb376b5f04f" ], "v14": [ "14.18", - "4cca6f8083483dda9e12eae292cf788d45bd561f" + "55c0d45abe6467c02084c2192bca117eda6ce1e7" ] } From cdb6479c8abd87df7c0c535ced25aeef5991a983 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 19 May 2025 11:03:06 +0200 Subject: [PATCH 117/142] pageserver: add gRPC page service schema (#11815) ## Problem For the [communicator project](https://github.com/neondatabase/company_projects/issues/352), we want to move to gRPC for the page service protocol. Touches #11728. ## Summary of changes This patch adds an experimental gRPC Protobuf schema for the page service. It is equivalent to the current page service, but with several improvements, e.g.: * Connection multiplexing. * Reduced head-of-line blocking. 
* Client-side batching. * Explicit tenant shard routing. * GetPage request classification (normal vs. prefetch). * Explicit rate limiting ("slow down" response status). The API is exposed as a new `pageserver/page_api` package. This is separate from the `pageserver_api` package to reduce the dependency footprint for the communicator. The longer-term plan is to also split out e.g. the WAL ingestion service to a separate gRPC package, e.g. `pageserver/wal_api`. Subsequent PRs will: add Rust domain types for the Protobuf types, expose a gRPC server, and implement the page service. Preliminary prototype benchmarks of this gRPC API is within 10% of baseline libpq performance. We'll do further benchmarking and optimization as the implementation lands in `main` and is deployed to staging. --- Cargo.lock | 10 + Cargo.toml | 2 + pageserver/page_api/Cargo.toml | 13 ++ pageserver/page_api/build.rs | 7 + pageserver/page_api/proto/page_service.proto | 220 +++++++++++++++++++ pageserver/page_api/src/lib.rs | 14 ++ 6 files changed, 266 insertions(+) create mode 100644 pageserver/page_api/Cargo.toml create mode 100644 pageserver/page_api/build.rs create mode 100644 pageserver/page_api/proto/page_service.proto create mode 100644 pageserver/page_api/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8ca65b58ce..d919537818 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4434,6 +4434,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_page_api" +version = "0.1.0" +dependencies = [ + "prost 0.13.3", + "tonic", + "tonic-build", + "workspace_hack", +] + [[package]] name = "papaya" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 74b281f88f..a280c446b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "pageserver/ctl", "pageserver/client", "pageserver/pagebench", + "pageserver/page_api", "proxy", "safekeeper", "safekeeper/client", @@ -252,6 +253,7 @@ pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } +pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml new file mode 100644 index 0000000000..c237949226 --- /dev/null +++ b/pageserver/page_api/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "pageserver_page_api" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +prost.workspace = true +tonic.workspace = true +workspace_hack.workspace = true + +[build-dependencies] +tonic-build.workspace = true diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs new file mode 100644 index 0000000000..ce3c49ed82 --- /dev/null +++ b/pageserver/page_api/build.rs @@ -0,0 +1,7 @@ +fn main() -> Result<(), Box> { + // Generates Rust code from .proto Protobuf schemas. 
+ tonic_build::configure() + .bytes(["."]) + .compile_protos(&["proto/page_service.proto"], &["proto"]) + .map_err(|err| err.into()) +} diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto new file mode 100644 index 0000000000..12e4d2f9db --- /dev/null +++ b/pageserver/page_api/proto/page_service.proto @@ -0,0 +1,220 @@ +// Page service, presented by pageservers for computes. +// +// This is the compute read path. It primarily serves page versions at given +// LSNs, but also base backups, SLRU segments, and relation metadata. +// +// EXPERIMENTAL: this is still under development and subject to change. +// +// Request metadata headers: +// - authorization: JWT token ("Bearer "), if auth is enabled +// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980") +// - neon-shard-id: shard ID, as in hex ("0b10" = shard 11 of 16, 0-based) +// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") +// +// TODO: consider adding neon-compute-mode ("primary", "static", "replica"). +// However, this will require reconnecting when changing modes. +// +// TODO: write implementation guidance on +// - Health checks +// - Tracing, OpenTelemetry +// - Compression + +syntax = "proto3"; +package page_service; + +service PageService { + // Returns whether a relation exists. + rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); + + // Fetches a base backup. + rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); + + // Returns the total size of a database, as # of bytes. + rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse); + + // Fetches pages. + // + // This is implemented as a bidirectional streaming RPC for performance. Unary + // requests incur costs for e.g. HTTP/2 stream setup, header parsing, + // authentication, and so on -- with streaming, we only pay these costs during + // the initial stream setup. This ~doubles throughput in benchmarks. Other + // RPCs use regular unary requests, since they are not as frequent and + // performance-critical, and this simplifies implementation. + // + // NB: a status response (e.g. errors) will terminate the stream. The stream + // may be shared by e.g. multiple Postgres backends, so we should avoid this. + // Most errors are therefore sent as GetPageResponse.status instead. + rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse); + + // Returns the size of a relation, as # of blocks. + rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse); + + // Fetches an SLRU segment. + rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); +} + +// The LSN a request should read at. +message ReadLsn { + // The request's read LSN. Required. + uint64 request_lsn = 1; + // If given, the caller guarantees that the page has not been modified since + // this LSN. Must be smaller than or equal to request_lsn. This allows the + // Pageserver to serve an old page without waiting for the request LSN to + // arrive. Valid for all request types. + // + // It is undefined behaviour to make a request such that the page was, in + // fact, modified between request_lsn and not_modified_since_lsn. The + // Pageserver might detect it and return an error, or it might return the old + // page version or the new page version. Setting not_modified_since_lsn equal + // to request_lsn is always safe, but can lead to unnecessary waiting. + uint64 not_modified_since_lsn = 2; +} + +// A relation identifier. 
+message RelTag { + uint32 spc_oid = 1; + uint32 db_oid = 2; + uint32 rel_number = 3; + uint32 fork_number = 4; +} + +// Checks whether a relation exists, at the given LSN. Only valid on shard 0, +// other shards will error. +message CheckRelExistsRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message CheckRelExistsResponse { + bool exists = 1; +} + +// Requests a base backup at a given LSN. +message GetBaseBackupRequest { + // The LSN to fetch a base backup at. + ReadLsn read_lsn = 1; + // If true, logical replication slots will not be created. + bool replica = 2; +} + +// Base backup response chunk, returned as an ordered stream. +message GetBaseBackupResponseChunk { + // A basebackup data chunk. The size is undefined, but bounded by the 4 MB + // gRPC message size limit. + bytes chunk = 1; +} + +// Requests the size of a database, as # of bytes. Only valid on shard 0, other +// shards will error. +message GetDbSizeRequest { + ReadLsn read_lsn = 1; + uint32 db_oid = 2; +} + +message GetDbSizeResponse { + uint64 num_bytes = 1; +} + +// Requests one or more pages. +message GetPageRequest { + // A request ID. Will be included in the response. Should be unique for + // in-flight requests on the stream. + uint64 request_id = 1; + // The request class. + GetPageClass request_class = 2; + // The LSN to read at. + ReadLsn read_lsn = 3; + // The relation to read from. + RelTag rel = 4; + // Page numbers to read. Must belong to the remote shard. + // + // Multiple pages will be executed as a single batch by the Pageserver, + // amortizing layer access costs and parallelizing them. This may increase the + // latency of any individual request, but improves the overall latency and + // throughput of the batch as a whole. + // + // TODO: this causes an allocation in the common single-block case. The sender + // can use a SmallVec to stack-allocate it, but Prost will always deserialize + // into a heap-allocated Vec. Consider optimizing this. + // + // TODO: we might be able to avoid a sort or something if we mandate that these + // are always in order. But we can't currenly rely on this on the server, because + // of compatibility with the libpq protocol handler. + repeated uint32 block_number = 5; +} + +// A GetPageRequest class. Primarily intended for observability, but may also be +// used for prioritization in the future. +enum GetPageClass { + // Unknown class. For forwards compatibility: used when the client sends a + // class that the server doesn't know about. + GET_PAGE_CLASS_UNKNOWN = 0; + // A normal request. This is the default. + GET_PAGE_CLASS_NORMAL = 1; + // A prefetch request. NB: can only be classified on pg < 18. + GET_PAGE_CLASS_PREFETCH = 2; + // A background request (e.g. vacuum). + GET_PAGE_CLASS_BACKGROUND = 3; +} + +// A GetPage response. +// +// A batch response will contain all of the requested pages. We could eagerly +// emit individual pages as soon as they are ready, but on a readv() Postgres +// holds buffer pool locks on all pages in the batch and we'll only return once +// the entire batch is ready, so no one can make use of the individual pages. +message GetPageResponse { + // The original request's ID. + uint64 request_id = 1; + // The response status code. + GetPageStatus status = 2; + // A string describing the status, if any. + string reason = 3; + // The 8KB page images, in the same order as the request. Empty if status != OK. + repeated bytes page_image = 4; +} + +// A GetPageResponse status code. 
Since we use a bidirectional stream, we don't +// want to send errors as gRPC statuses, since this would terminate the stream. +enum GetPageStatus { + // Unknown status. For forwards compatibility: used when the server sends a + // status code that the client doesn't know about. + GET_PAGE_STATUS_UNKNOWN = 0; + // The request was successful. + GET_PAGE_STATUS_OK = 1; + // The page did not exist. The tenant/timeline/shard has already been + // validated during stream setup. + GET_PAGE_STATUS_NOT_FOUND = 2; + // The request was invalid. + GET_PAGE_STATUS_INVALID = 3; + // The tenant is rate limited. Slow down and retry later. + GET_PAGE_STATUS_SLOW_DOWN = 4; + // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a + // layer download. This could free up the server task to process other + // requests while the layer download is in progress. +} + +// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on +// shard 0, other shards will error. +message GetRelSizeRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message GetRelSizeResponse { + uint32 num_blocks = 1; +} + +// Requests an SLRU segment. Only valid on shard 0, other shards will error. +message GetSlruSegmentRequest { + ReadLsn read_lsn = 1; + uint32 kind = 2; + uint32 segno = 3; +} + +// Returns an SLRU segment. +// +// These are up 32 pages (256 KB), so we can send them as a single response. +message GetSlruSegmentResponse { + bytes segment = 1; +} diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs new file mode 100644 index 0000000000..0226d594cb --- /dev/null +++ b/pageserver/page_api/src/lib.rs @@ -0,0 +1,14 @@ +//! This crate provides the Pageserver's page API. It contains: +//! +//! * proto/page_service.proto: the Protobuf schema for the page API. +//! * proto: auto-generated Protobuf types for gRPC. +//! +//! This crate is used by both the client and the server. Try to keep it slim. + +// Code generated by protobuf. 
+pub mod proto { + tonic::include_proto!("page_service"); + + pub use page_service_client::PageServiceClient; + pub use page_service_server::{PageService, PageServiceServer}; +} From 76a7d37f7e266a946a0de91dae89f7ded66ef09f Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Mon, 19 May 2025 13:10:55 +0300 Subject: [PATCH 118/142] proxy: Drop cancellation ops if they don't fit into the queue (#11950) Add a redis ops batch size argument for proxy and remove timeouts by using try_send() --- proxy/src/binary/proxy.rs | 12 ++++++++++-- proxy/src/cancellation.rs | 20 +++++++++----------- proxy/src/console_redirect_proxy.rs | 4 +--- proxy/src/proxy/mod.rs | 4 +--- proxy/src/proxy/passthrough.rs | 2 +- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 51713902bc..f40d5041c1 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -161,8 +161,11 @@ struct ProxyCliArgs { #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] redis_rps_limit: Vec, /// Cancellation channel size (max queue size for redis kv client) - #[clap(long, default_value = "1024")] + #[clap(long, default_value_t = 1024)] cancellation_ch_size: usize, + /// Cancellation ops batch size for redis + #[clap(long, default_value_t = 8)] + cancellation_batch_size: usize, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -542,7 +545,12 @@ pub async fn run() -> anyhow::Result<()> { if let Some(mut redis_kv_client) = redis_kv_client { maintenance_tasks.spawn(async move { redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?; + handle_cancel_messages( + &mut redis_kv_client, + rx_cancel, + args.cancellation_batch_size, + ) + .await?; drop(redis_kv_client); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index f34fb747ca..a6e7bf85a0 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect; type IpSubnetKey = IpNet; const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time -const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); -const BATCH_SIZE: usize = 8; // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -231,12 +229,13 @@ impl CancelReplyOp { pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, + batch_size: usize, ) -> anyhow::Result<()> { - let mut batch = Vec::with_capacity(BATCH_SIZE); - let mut pipeline = Pipeline::with_capacity(BATCH_SIZE); + let mut batch = Vec::with_capacity(batch_size); + let mut pipeline = Pipeline::with_capacity(batch_size); loop { - if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { + if rx.recv_many(&mut batch, batch_size).await == 0 { warn!("shutting down cancellation queue"); break Ok(()); } @@ -367,8 +366,7 @@ impl CancellationHandler { return Err(CancelError::InternalError); }; - tx.send_timeout(op, REDIS_SEND_TIMEOUT) - .await + tx.try_send(op) .map_err(|e| { tracing::warn!("failed to send GetCancelData for {key}: {e}"); }) @@ -570,7 +568,7 @@ impl Session { } // Send the store key op to the cancellation handler and set TTL for the key - pub(crate) async fn write_cancel_key( + pub(crate) fn write_cancel_key( &self, cancel_closure: CancelClosure, ) -> Result<(), CancelError> { @@ -596,14 +594,14 @@ impl Session { expire: CANCEL_KEY_TTL, 
}; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); }); Ok(()) } - pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> { let Some(tx) = &self.cancellation_handler.tx else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); @@ -619,7 +617,7 @@ impl Session { .guard(RedisMsgKind::HDel), }; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); }); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 0f2c3def0d..e3184e20d1 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -244,9 +244,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cf331b8bc0..0a86022e78 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -383,9 +383,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c100b8d716..8f9bd2de2d 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -94,7 +94,7 @@ impl ProxyPassthrough { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } - drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error + drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error res } From 3685ad606d11de706b9d0eb5841b7801d6ae8a7d Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 19 May 2025 10:56:03 +0000 Subject: [PATCH 119/142] endpoint_storage: Fix metrics test by excluding assertion on macos (#11952) --- endpoint_storage/src/app.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index 0bd7fe5f28..f44efe6d7a 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH if var(REAL_S3_ENV).is_ok() { assert!(body.contains("remote_storage_s3_deleted_objects_total")); } + + #[cfg(target_os = "linux")] assert!(body.contains("process_threads")); } From 38dbc5f67f3dfbf501fb289f12f193bdec54ff6d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 19 May 2025 13:17:45 +0200 Subject: [PATCH 120/142] pageserver/page_api: add binary Protobuf descriptor (#11968) ## Problem A binary Protobuf schema descriptor can be used to expose an API reflection service, which in turn allows convenient usage of e.g. 
`grpcurl` against the gRPC server. Touches #11728. ## Summary of changes * Generate a binary schema descriptor as `pageserver_page_api::proto::FILE_DESCRIPTOR_SET`. * Opportunistically rename the Protobuf package from `page_service` to `page_api`. --- pageserver/page_api/build.rs | 8 +++++++- pageserver/page_api/proto/page_service.proto | 15 ++++++++++++++- pageserver/page_api/src/lib.rs | 7 ++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs index ce3c49ed82..e96297f10e 100644 --- a/pageserver/page_api/build.rs +++ b/pageserver/page_api/build.rs @@ -1,7 +1,13 @@ +use std::env; +use std::path::PathBuf; + +/// Generates Rust code from .proto Protobuf schemas, along with a binary file +/// descriptor set for Protobuf schema reflection. fn main() -> Result<(), Box> { - // Generates Rust code from .proto Protobuf schemas. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); tonic_build::configure() .bytes(["."]) + .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin")) .compile_protos(&["proto/page_service.proto"], &["proto"]) .map_err(|err| err.into()) } diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 12e4d2f9db..f6acb3eeeb 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -11,6 +11,19 @@ // - neon-shard-id: shard ID, as in hex ("0b10" = shard 11 of 16, 0-based) // - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") // +// The service can be accessed via e.g. grpcurl: +// +// ``` +// grpcurl \ +// -plaintext \ +// -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \ +// -H "neon-shard-id: 0b10" \ +// -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \ +// -H "authorization: Bearer $JWT" \ +// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' +// localhost:51051 page_api.PageService/CheckRelExists +// ``` +// // TODO: consider adding neon-compute-mode ("primary", "static", "replica"). // However, this will require reconnecting when changing modes. // @@ -20,7 +33,7 @@ // - Compression syntax = "proto3"; -package page_service; +package page_api; service PageService { // Returns whether a relation exists. diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs index 0226d594cb..0b68d03aaa 100644 --- a/pageserver/page_api/src/lib.rs +++ b/pageserver/page_api/src/lib.rs @@ -7,7 +7,12 @@ // Code generated by protobuf. pub mod proto { - tonic::include_proto!("page_service"); + tonic::include_proto!("page_api"); + + /// File descriptor set for Protobuf schema reflection. This allows using + /// e.g. grpcurl with the API. + pub const FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("page_api_descriptor"); pub use page_service_client::PageServiceClient; pub use page_service_server::{PageService, PageServiceServer}; From f4150614d0e1bbfa106c3e762670ccec09385cc5 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 19 May 2025 17:47:40 +0200 Subject: [PATCH 121/142] pageserver: don't pass config to `PageHandler` (#11973) ## Problem The gRPC page service API will require decoupling the `PageHandler` from the libpq protocol implementation. As preparation for this, avoid passing in the entire server config to `PageHandler`, and instead explicitly pass in the relevant fields. Touches https://github.com/neondatabase/neon/issues/11728. 
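The shape of the refactor is plain dependency narrowing: the handler's constructor takes only the field it actually consumes instead of the whole config. A minimal sketch of that shape, using hypothetical stand-in names (`ServerConfig`, `IoMode`, `Handler`) rather than the real pageserver types:

```rust
// Sketch only, not the actual pageserver code.
#[derive(Clone, Copy)]
enum IoMode {
    Sequential,
    SidecarTask,
}

struct ServerConfig {
    io_mode: IoMode,
    // ...many fields the handler never looks at...
}

struct Handler {
    // Before: a `&'static ServerConfig` pulled the entire config into the
    // handler. After: only the knob it actually consumes.
    io_mode: IoMode,
}

impl Handler {
    fn new(io_mode: IoMode) -> Self {
        Self { io_mode }
    }
}

fn build_handler(conf: &ServerConfig) -> Handler {
    // The caller extracts the relevant field once, at the construction site.
    Handler::new(conf.io_mode)
}

fn main() {
    let conf = ServerConfig { io_mode: IoMode::SidecarTask };
    let _handler = build_handler(&conf);
}
```

Besides shrinking the handler's dependency surface, this leaves a future gRPC implementation free to construct the handler without a `'static` config reference.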
## Summary of changes * Change `PageHandler` to take a `GetVectoredConcurrentIo` instead of the entire config. * Change `IoConcurrency::spawn_from_conf` to take a `GetVectoredConcurrentIo`. --- libs/pageserver_api/src/config.rs | 2 +- pageserver/src/basebackup.rs | 2 +- pageserver/src/http/routes.rs | 2 +- pageserver/src/page_service.rs | 12 ++++++------ pageserver/src/pgdatadir_mapping.rs | 6 +++--- pageserver/src/tenant.rs | 6 ++++-- pageserver/src/tenant/storage_layer.rs | 7 +++---- pageserver/src/tenant/timeline.rs | 4 ++-- pageserver/src/tenant/timeline/detach_ancestor.rs | 2 +- 9 files changed, 22 insertions(+), 21 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index f2ba50a86f..2618366469 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -235,7 +235,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy { ScatteredLsn, } -#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] pub enum GetVectoredConcurrentIo { /// The read path is fully sequential: layers are visited diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 3510ccb529..b49021461e 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -144,7 +144,7 @@ where replica, ctx, io_concurrency: IoConcurrency::spawn_from_conf( - timeline.conf, + timeline.conf.get_vectored_concurrent_io, timeline .gate .enter() diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2edec9dda1..0b36eb5df7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3199,7 +3199,7 @@ async fn list_aux_files( .await?; let io_concurrency = IoConcurrency::spawn_from_conf( - state.conf, + state.conf.get_vectored_concurrent_io, timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, ); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 101e312ec3..83d9191240 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -18,7 +18,7 @@ use itertools::Itertools; use jsonwebtoken::TokenData; use once_cell::sync::OnceCell; use pageserver_api::config::{ - PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::key::rel_block_to_key; @@ -331,10 +331,10 @@ async fn page_service_conn_main( // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
let mut conn_handler = PageServerHandler::new( - conf, tenant_manager, auth, pipelining_config, + conf.get_vectored_concurrent_io, perf_span_fields, connection_ctx, cancel.clone(), @@ -371,7 +371,6 @@ async fn page_service_conn_main( } struct PageServerHandler { - conf: &'static PageServerConf, auth: Option>, claims: Option, @@ -389,6 +388,7 @@ struct PageServerHandler { timeline_handles: Option, pipelining_config: PageServicePipeliningConfig, + get_vectored_concurrent_io: GetVectoredConcurrentIo, gate_guard: GateGuard, } @@ -844,17 +844,16 @@ impl BatchedFeMessage { impl PageServerHandler { #[allow(clippy::too_many_arguments)] pub fn new( - conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, + get_vectored_concurrent_io: GetVectoredConcurrentIo, perf_span_fields: ConnectionPerfSpanFields, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, ) -> Self { PageServerHandler { - conf, auth, claims: None, connection_ctx, @@ -862,6 +861,7 @@ impl PageServerHandler { timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, + get_vectored_concurrent_io, gate_guard, } } @@ -1623,7 +1623,7 @@ impl PageServerHandler { } let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.get_vectored_concurrent_io, match self.gate_guard.try_clone() { Ok(guard) => guard, Err(_) => { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d770946580..0f9bfd19a7 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -586,7 +586,7 @@ impl Timeline { // scan directory listing (new), merge with the old results let key_range = rel_tag_sparse_key_range(spcnode, dbnode); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| PageReconstructError::Cancelled)?, @@ -645,7 +645,7 @@ impl Timeline { ); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| PageReconstructError::Cancelled)?, @@ -885,7 +885,7 @@ impl Timeline { ); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| PageReconstructError::Cancelled)?, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 441049f47d..fffd1f4090 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -8596,8 +8596,10 @@ mod tests { lsn: Lsn, ctx: &RequestContext, ) -> Result, GetVectoredError> { - let io_concurrency = - IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); + let io_concurrency = IoConcurrency::spawn_from_conf( + tline.conf.get_vectored_concurrent_io, + tline.gate.enter().unwrap(), + ); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); let mut res = tline diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 5dfa961b71..9d15e7c4de 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -31,6 +31,7 @@ pub use inmemory_layer::InMemoryLayer; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; +use 
pageserver_api::config::GetVectoredConcurrentIo; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; @@ -43,7 +44,6 @@ use self::inmemory_layer::InMemoryLayerFileId; use super::PageReconstructError; use super::layer_map::InMemoryLayerDesc; use super::timeline::{GetVectoredError, ReadPath}; -use crate::config::PageServerConf; use crate::context::{ AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, }; @@ -318,11 +318,10 @@ impl IoConcurrency { } pub(crate) fn spawn_from_conf( - conf: &'static PageServerConf, + conf: GetVectoredConcurrentIo, gate_guard: GateGuard, ) -> IoConcurrency { - use pageserver_api::config::GetVectoredConcurrentIo; - let selected = match conf.get_vectored_concurrent_io { + let selected = match conf { GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d7f5958128..d3c92ab47a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3530,7 +3530,7 @@ impl Timeline { }; let io_concurrency = IoConcurrency::spawn_from_conf( - self_ref.conf, + self_ref.conf.get_vectored_concurrent_io, self_ref .gate .enter() @@ -5559,7 +5559,7 @@ impl Timeline { }); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| CreateImageLayersError::Cancelled)?, diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 649b33e294..40eda8c785 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer( "removing non-inherited keys by writing an image layer with tombstones at the detach LSN" ); let io_concurrency = IoConcurrency::spawn_from_conf( - detached.conf, + detached.conf.get_vectored_concurrent_io, detached.gate.enter().map_err(|_| Error::ShuttingDown)?, ); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); From e94acbc816cf9eb453938353f301a648c6ce036c Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 20 May 2025 11:03:36 +0200 Subject: [PATCH 122/142] fix(compute_ctl): Dollar escaping and tests (#11969) ## Problem In the escaping path we were checking that `${tag}$` or `${outer_tag}$` are present in the string, but that's not enough, as original string surrounded by `$` can also form a 'tag', like `$x$xx$x$`, which is fine on it's own, but cannot be used in the string escaped with `$xx$`. ## Summary of changes Remove `$` from the checks, just check if `{tag}` or `{outer_tag}` are present. Add more test cases and change the catalog test to stress the `drop_subscriptions_before_start: true` path as well. 
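To make the invariant concrete, here is an illustrative sketch of the tag-selection idea, simplified to a single tag (the real `compute_tools` code also tracks the outer tag, which is always `tag + "x"`): grow the tag until it no longer occurs anywhere in the identifier, so `$tag$` can never appear inside the quoted body.

```rust
/// Sketch only, not the actual compute_tools implementation.
fn dollar_quote(ident: &str) -> String {
    let mut tag = String::from("x");
    // Postgres identifiers are at most 63 bytes, so this terminates quickly:
    // once the tag is longer than the identifier it cannot be a substring.
    while ident.contains(tag.as_str()) {
        tag.push('x');
    }
    format!("${tag}${ident}${tag}$")
}

fn main() {
    // Quoting "xx" as `$x$xx$x$` is valid on its own, but breaks once it is
    // nested inside an outer `$xx$...$xx$` quote; the containment check picks
    // a longer tag instead.
    assert_eq!(dollar_quote("xx"), "$xxx$xx$xxx$");
    assert_eq!(dollar_quote("name$$$"), "$x$name$$$$x$");
}
```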
Fixes https://github.com/neondatabase/cloud/issues/29198 --- compute_tools/src/pg_helpers.rs | 6 ++- compute_tools/tests/pg_helpers_tests.rs | 8 ++++ test_runner/regress/test_compute_catalog.py | 53 +++++++++++++++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 10d8f2c878..94467a0d2f 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -213,8 +213,10 @@ impl Escaping for PgIdent { // Find the first suitable tag that is not present in the string. // Postgres' max role/DB name length is 63 bytes, so even in the - // worst case it won't take long. - while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) { + // worst case it won't take long. Outer tag is always `tag + "x"`, + // so if `tag` is not present in the string, `outer_tag` is not + // present in the string either. + while self.contains(&tag.to_string()) { tag += "x"; outer_tag = tag.clone() + "x"; } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 53f2ddad84..04b6ed2256 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -71,6 +71,14 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor ("name$$$", ("$x$name$$$$x$", "xx")), ("name$$$$", ("$x$name$$$$$x$", "xx")), ("name$x$", ("$xx$name$x$$xx$", "xxx")), + ("x", ("$xx$x$xx$", "xxx")), + ("xx", ("$xxx$xx$xxx$", "xxxx")), + ("$x", ("$xx$$x$xx$", "xxx")), + ("x$", ("$xx$x$$xx$", "xxx")), + ("$x$", ("$xx$$x$$xx$", "xxx")), + ("xx$", ("$xxx$xx$$xxx$", "xxxx")), + ("$xx", ("$xxx$$xx$xxx$", "xxxx")), + ("$xx$", ("$xxx$$xx$$xxx$", "xxxx")), ]; for (input, expected) in test_cases { diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index b66b326360..6ee6837cd2 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -19,6 +19,16 @@ TEST_ROLE_NAMES = [ {"name": "role$"}, {"name": "role$$"}, {"name": "role$x$"}, + {"name": "x"}, + {"name": "xx"}, + {"name": "$x"}, + {"name": "x$"}, + {"name": "$x$"}, + {"name": "xx$"}, + {"name": "$xx"}, + {"name": "$xx$"}, + # 63 bytes is the limit for role/DB names in Postgres + {"name": "x" * 63}, ] TEST_DB_NAMES = [ @@ -74,6 +84,43 @@ TEST_DB_NAMES = [ "name": "db name$x$", "owner": "role$x$", }, + { + "name": "x", + "owner": "x", + }, + { + "name": "xx", + "owner": "xx", + }, + { + "name": "$x", + "owner": "$x", + }, + { + "name": "x$", + "owner": "x$", + }, + { + "name": "$x$", + "owner": "$x$", + }, + { + "name": "xx$", + "owner": "xx$", + }, + { + "name": "$xx", + "owner": "$xx", + }, + { + "name": "$xx$", + "owner": "$xx$", + }, + # 63 bytes is the limit for role/DB names in Postgres + { + "name": "x" * 63, + "owner": "x" * 63, + }, ] @@ -146,6 +193,10 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): """ Test that compute_ctl can create and work with databases and roles with special characters (whitespaces, %, tabs, etc.) in the name. + Also use `drop_subscriptions_before_start: true`. We do not actually + have any subscriptions in this test, so it should be no-op, but it + i) simulates the case when we create a second dev branch together with + a new project creation, and ii) just generally stresses more code paths. 
""" env = neon_simple_env @@ -159,6 +210,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): **{ "spec": { "skip_pg_catalog_updates": False, + "drop_subscriptions_before_start": True, "cluster": { "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, @@ -202,6 +254,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): **{ "spec": { "skip_pg_catalog_updates": False, + "drop_subscriptions_before_start": True, "cluster": { "roles": [], "databases": [], From 568779fa8a601b8f790a477dbe1a5b3caa9d6dad Mon Sep 17 00:00:00 2001 From: Konstantin Merenkov Date: Tue, 20 May 2025 17:23:54 +0200 Subject: [PATCH 123/142] proxy/scram: avoid memory copy to improve performance (#11980) Touches #11941 ## Problem Performance of our PBKDF2 was worse than reference. ## Summary of changes Avoided memory copy when HMACing in a tight loop. --- proxy/src/scram/pbkdf2.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 9c559e9082..7f48e00c41 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -13,22 +13,19 @@ pub(crate) struct Pbkdf2 { // inspired from impl Pbkdf2 { pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { - let hmac = + // key the HMAC and derive the first block in-place + let mut hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); - - let prev = hmac - .clone() - .chain_update(salt) - .chain_update(1u32.to_be_bytes()) - .finalize() - .into_bytes(); + hmac.update(salt); + hmac.update(&1u32.to_be_bytes()); + let init_block = hmac.finalize_reset().into_bytes(); Self { hmac, - // one consumed for the hash above + // one iteration spent above iterations: iterations - 1, - hi: prev, - prev, + hi: init_block, + prev: init_block, } } @@ -44,14 +41,17 @@ impl Pbkdf2 { iterations, } = self; - // only do 4096 iterations per turn before sharing the thread for fairness + // only do up to 4096 iterations per turn for fairness let n = (*iterations).clamp(0, 4096); for _ in 0..n { - *prev = hmac.clone().chain_update(*prev).finalize().into_bytes(); + hmac.update(prev); + let block = hmac.finalize_reset().into_bytes(); - for (hi, prev) in hi.iter_mut().zip(*prev) { - *hi ^= prev; + for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) { + *hi_byte ^= b; } + + *prev = block; } *iterations -= n; From 2e3dc9a8c203ad3a73fc97683a11a928b187bf7f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 20 May 2025 18:38:27 +0300 Subject: [PATCH 124/142] Add rel_size_replica_cache (#11889) ## Problem See Discussion: https://neondb.slack.com/archives/C033RQ5SPDH/p1746645666075799 Issue: https://github.com/neondatabase/cloud/issues/28609 Relation size cache is not correctly updated at PS in case of replicas. ## Summary of changes 1. Have two caches for relation size in timeline: `rel_size_primary_cache` and `rel_size_replica_cache`. 2. `rel_size_primary_cache` is actually what we have now. The only difference is that it is not updated in `get_rel_size`, only by WAL ingestion 3. `rel_size_replica_cache` has limited size (LruCache) and it's key is `(Lsn,RelTag)` . It is updated in `get_rel_size`. Only strict LSN matches are accepted as cache hit. 
--------- Co-authored-by: Konstantin Knizhnik --- Cargo.lock | 1 + control_plane/src/pageserver.rs | 5 + libs/pageserver_api/src/config.rs | 4 + libs/pageserver_api/src/models.rs | 13 ++ pageserver/Cargo.toml | 1 + pageserver/src/basebackup.rs | 10 +- pageserver/src/metrics.rs | 45 ++++- pageserver/src/page_service.rs | 59 ++++-- pageserver/src/pgdatadir_mapping.rs | 169 ++++++++++++------ pageserver/src/tenant/timeline.rs | 35 ++-- pageserver/src/walingest.rs | 90 +++++----- .../regress/test_attach_tenant_config.py | 1 + test_runner/regress/test_replica_start.py | 110 +++++++++++- 13 files changed, 395 insertions(+), 148 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d919537818..9f4d537b33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4286,6 +4286,7 @@ dependencies = [ "enumset", "fail", "futures", + "hashlink", "hex", "hex-literal", "http-utils", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 79e87eba9b..587f3774d4 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -546,6 +546,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Falied to parse 'sampling_ratio'")?, + relsize_snapshot_cache_capacity: settings + .remove("relsize snapshot cache capacity") + .map(|x| x.parse::()) + .transpose() + .context("Falied to parse 'relsize_snapshot_cache_capacity' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 2618366469..73b6eee554 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -491,6 +491,8 @@ pub struct TenantConfigToml { /// Tenant level performance sampling ratio override. Controls the ratio of get page requests /// that will get perf sampling for the tenant. pub sampling_ratio: Option, + /// Capacity of relsize snapshot cache (used by replicas). 
+ pub relsize_snapshot_cache_capacity: usize, } pub mod defaults { @@ -730,6 +732,7 @@ pub mod tenant_conf_defaults { pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true; pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; + pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000; } impl Default for TenantConfigToml { @@ -787,6 +790,7 @@ impl Default for TenantConfigToml { gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, sampling_ratio: None, + relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e9b37c8ca6..ca26286b86 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -630,6 +630,8 @@ pub struct TenantConfigPatch { pub gc_compaction_ratio_percent: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub sampling_ratio: FieldPatch>, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub relsize_snapshot_cache_capacity: FieldPatch, } /// Like [`crate::config::TenantConfigToml`], but preserves the information @@ -759,6 +761,9 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub sampling_ratio: Option>, + + #[serde(skip_serializing_if = "Option::is_none")] + pub relsize_snapshot_cache_capacity: Option, } impl TenantConfig { @@ -804,6 +809,7 @@ impl TenantConfig { mut gc_compaction_initial_threshold_kb, mut gc_compaction_ratio_percent, mut sampling_ratio, + mut relsize_snapshot_cache_capacity, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -905,6 +911,9 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); patch.sampling_ratio.apply(&mut sampling_ratio); + patch + .relsize_snapshot_cache_capacity + .apply(&mut relsize_snapshot_cache_capacity); Ok(Self { checkpoint_distance, @@ -944,6 +953,7 @@ impl TenantConfig { gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, sampling_ratio, + relsize_snapshot_cache_capacity, }) } @@ -1052,6 +1062,9 @@ impl TenantConfig { .gc_compaction_ratio_percent .unwrap_or(global_conf.gc_compaction_ratio_percent), sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio), + relsize_snapshot_cache_capacity: self + .relsize_snapshot_cache_capacity + .unwrap_or(global_conf.relsize_snapshot_cache_capacity), } } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index b7b3e0eaf1..6a9a5a292a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -30,6 +30,7 @@ crc32c.workspace = true either.workspace = true fail.workspace = true futures.workspace = true +hashlink.workspace = true hex.workspace = true humantime.workspace = true humantime-serde.workspace = true diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index b49021461e..e89baa0bce 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -343,7 +343,7 @@ where // Gather non-relational files from object storage pages. let slru_partitions = self .timeline - .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) + .get_slru_keyspace(Version::at(self.lsn), self.ctx) .await? .partition( self.timeline.get_shard_identity(), @@ -378,7 +378,7 @@ where // Otherwise only include init forks of unlogged relations. 
let rels = self .timeline - .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) + .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty @@ -517,7 +517,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) + .get_rel_size(src, Version::at(self.lsn), self.ctx) .await?; // If the relation is empty, create an empty file @@ -577,7 +577,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) + .get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await?; if img.len() @@ -631,7 +631,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) + .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await? .is_empty() { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 8e4dbd6c3e..c50f730f41 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -843,23 +843,50 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(| .expect("failed to define a metric") }); -pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy = Lazy::new(|| { +pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy = Lazy::new(|| { register_uint_gauge!( - "pageserver_relsize_cache_entries", - "Number of entries in the relation size cache", + "pageserver_relsize_latest_cache_entries", + "Number of entries in the latest relation size cache", ) .expect("failed to define a metric") }); -pub(crate) static RELSIZE_CACHE_HITS: Lazy = Lazy::new(|| { - register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",) - .expect("failed to define a metric") +pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_latest_cache_hits", + "Latest relation size cache hits", + ) + .expect("failed to define a metric") }); -pub(crate) static RELSIZE_CACHE_MISSES: Lazy = Lazy::new(|| { +pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy = Lazy::new(|| { register_int_counter!( - "pageserver_relsize_cache_misses", - "Relation size cache misses", + "pageserver_relsize_latest_cache_misses", + "Relation size latest cache misses", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_relsize_snapshot_cache_entries", + "Number of entries in the pitr relation size cache", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_snapshot_cache_hits", + "Pitr relation size cache hits", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_snapshot_cache_misses", + "Relation size snapshot cache misses", ) .expect("failed to define a metric") }); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 83d9191240..e46ba8d3a1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -62,7 +62,7 @@ use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, SmgrOpTimer, TimelineMetrics, }; -use 
crate::pgdatadir_mapping::Version; +use crate::pgdatadir_mapping::{LsnRange, Version}; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, @@ -642,7 +642,7 @@ impl std::fmt::Display for BatchedPageStreamError { struct BatchedGetPageRequest { req: PagestreamGetPageRequest, timer: SmgrOpTimer, - effective_request_lsn: Lsn, + lsn_range: LsnRange, ctx: RequestContext, } @@ -764,12 +764,12 @@ impl BatchedFeMessage { match batching_strategy { PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => { if let Some(last_in_batch) = accum_pages.last() { - if last_in_batch.effective_request_lsn - != this_pages[0].effective_request_lsn + if last_in_batch.lsn_range.effective_lsn + != this_pages[0].lsn_range.effective_lsn { trace!( - accum_lsn = %last_in_batch.effective_request_lsn, - this_lsn = %this_pages[0].effective_request_lsn, + accum_lsn = %last_in_batch.lsn_range.effective_lsn, + this_lsn = %this_pages[0].lsn_range.effective_lsn, "stopping batching because LSN changed" ); @@ -784,15 +784,15 @@ impl BatchedFeMessage { let same_page_different_lsn = accum_pages.iter().any(|batched| { batched.req.rel == this_pages[0].req.rel && batched.req.blkno == this_pages[0].req.blkno - && batched.effective_request_lsn - != this_pages[0].effective_request_lsn + && batched.lsn_range.effective_lsn + != this_pages[0].lsn_range.effective_lsn }); if same_page_different_lsn { trace!( rel=%this_pages[0].req.rel, blkno=%this_pages[0].req.blkno, - lsn=%this_pages[0].effective_request_lsn, + lsn=%this_pages[0].lsn_range.effective_lsn, "stopping batching because same page was requested at different LSNs" ); @@ -1158,7 +1158,7 @@ impl PageServerHandler { .await?; // We're holding the Handle - let effective_request_lsn = match Self::effective_request_lsn( + let effective_lsn = match Self::effective_request_lsn( &shard, shard.get_last_record_lsn(), req.hdr.request_lsn, @@ -1177,7 +1177,10 @@ impl PageServerHandler { pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, - effective_request_lsn, + lsn_range: LsnRange { + effective_lsn, + request_lsn: req.hdr.request_lsn + }, ctx, }], // The executor grabs the batch when it becomes idle. @@ -2127,7 +2130,14 @@ impl PageServerHandler { .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) + .get_rel_exists( + req.rel, + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: req.hdr.request_lsn, + }), + ctx, + ) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -2154,7 +2164,14 @@ impl PageServerHandler { .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), ctx) + .get_rel_size( + req.rel, + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: req.hdr.request_lsn, + }), + ctx, + ) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -2181,7 +2198,15 @@ impl PageServerHandler { .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) + .get_db_size( + DEFAULTTABLESPACE_OID, + req.dbnode, + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: req.hdr.request_lsn, + }), + ctx, + ) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -2214,7 +2239,7 @@ impl PageServerHandler { // Ignore error (trace buffer may be full or tracer may have disconnected). 
_ = page_trace.try_send(PageTraceEvent { key, - effective_lsn: batch.effective_request_lsn, + effective_lsn: batch.lsn_range.effective_lsn, time, }); } @@ -2229,7 +2254,7 @@ impl PageServerHandler { perf_instrument = true; } - req.effective_request_lsn + req.lsn_range.effective_lsn }) .max() .expect("batch is never empty"); @@ -2283,7 +2308,7 @@ impl PageServerHandler { ( &p.req.rel, &p.req.blkno, - p.effective_request_lsn, + p.lsn_range, p.ctx.attached_child(), ) }), diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0f9bfd19a7..c6f3929257 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -43,7 +43,9 @@ use crate::aux_file; use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ - RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, + RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS, + RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS, + RELSIZE_SNAPSHOT_CACHE_MISSES, }; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, @@ -90,6 +92,28 @@ pub enum LsnForTimestamp { NoData(Lsn), } +/// Each request to page server contains LSN range: `not_modified_since..request_lsn`. +/// See comments libs/pageserver_api/src/models.rs. +/// Based on this range and `last_record_lsn` PS calculates `effective_lsn`. +/// But to distinguish requests from primary and replicas we need also to pass `request_lsn`. +#[derive(Debug, Clone, Copy, Default)] +pub struct LsnRange { + pub effective_lsn: Lsn, + pub request_lsn: Lsn, +} + +impl LsnRange { + pub fn at(lsn: Lsn) -> LsnRange { + LsnRange { + effective_lsn: lsn, + request_lsn: lsn, + } + } + pub fn is_latest(&self) -> bool { + self.request_lsn == Lsn::MAX + } +} + #[derive(Debug, thiserror::Error)] pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] @@ -202,13 +226,13 @@ impl Timeline { io_concurrency: IoConcurrency, ) -> Result { match version { - Version::Lsn(effective_lsn) => { + Version::LsnRange(lsns) => { let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self .get_rel_page_at_lsn_batched( - pages.iter().map(|(tag, blknum)| { - (tag, blknum, effective_lsn, ctx.attached_child()) - }), + pages + .iter() + .map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())), io_concurrency.clone(), ctx, ) @@ -246,7 +270,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. 
pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: impl ExactSizeIterator, + pages: impl ExactSizeIterator, io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { @@ -265,7 +289,7 @@ impl Timeline { let mut req_keyspaces: HashMap = HashMap::with_capacity(pages.len()); - for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() { + for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -274,7 +298,7 @@ impl Timeline { slots_filled += 1; continue; } - + let lsn = lsns.effective_lsn; let nblocks = { let ctx = RequestContextBuilder::from(&ctx) .perf_span(|crnt_perf_span| { @@ -289,7 +313,7 @@ impl Timeline { .attached_child(); match self - .get_rel_size(*tag, Version::Lsn(lsn), &ctx) + .get_rel_size(*tag, Version::LsnRange(lsns), &ctx) .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await { @@ -470,7 +494,7 @@ impl Timeline { )); } - if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { + if let Some(nblocks) = self.get_cached_rel_size(&tag, version) { return Ok(nblocks); } @@ -488,7 +512,7 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + self.update_cached_rel_size(tag, version, nblocks); Ok(nblocks) } @@ -510,7 +534,7 @@ impl Timeline { } // first try to lookup relation in cache - if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { + if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) { return Ok(true); } // then check if the database was already initialized. @@ -632,7 +656,7 @@ impl Timeline { ) -> Result { assert!(self.tenant_shard_id.is_shard_zero()); let n_blocks = self - .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) + .get_slru_segment_size(kind, segno, Version::at(lsn), ctx) .await?; let keyspace = KeySpace::single( @@ -867,11 +891,11 @@ impl Timeline { mut f: impl FnMut(TimestampTz) -> ControlFlow, ) -> Result { for segno in self - .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx) + .list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx) .await? { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) + .get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx) .await?; let keyspace = KeySpace::single( @@ -1137,7 +1161,7 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self - .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx) + .list_rels(*spcnode, *dbnode, Version::at(lsn), ctx) .await? { if self.cancel.is_cancelled() { @@ -1212,7 +1236,7 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx) + .list_rels(spcnode, dbnode, Version::at(lsn), ctx) .await? .into_iter() .collect(); @@ -1329,59 +1353,75 @@ impl Timeline { Ok((dense_keyspace, sparse_keyspace)) } - /// Get cached size of relation if it not updated after specified LSN - pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { - let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { - if lsn >= *cached_lsn { - RELSIZE_CACHE_HITS.inc(); - return Some(*nblocks); + /// Get cached size of relation. 
There are two caches: one for primary updates, it captures the latest state of + /// of the timeline and snapshot cache, which key includes LSN and so can be used by replicas to get relation size + /// at the particular LSN (snapshot). + pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option { + let lsn = version.get_lsn(); + { + let rel_size_cache = self.rel_size_latest_cache.read().unwrap(); + if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if lsn >= *cached_lsn { + RELSIZE_LATEST_CACHE_HITS.inc(); + return Some(*nblocks); + } + RELSIZE_CACHE_MISSES_OLD.inc(); } - RELSIZE_CACHE_MISSES_OLD.inc(); } - RELSIZE_CACHE_MISSES.inc(); + { + let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap(); + if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) { + RELSIZE_SNAPSHOT_CACHE_HITS.inc(); + return Some(*nblock); + } + } + if version.is_latest() { + RELSIZE_LATEST_CACHE_MISSES.inc(); + } else { + RELSIZE_SNAPSHOT_CACHE_MISSES.inc(); + } None } /// Update cached relation size if there is no more recent update - pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - - if lsn < rel_size_cache.complete_as_of { - // Do not cache old values. It's safe to cache the size on read, as long as - // the read was at an LSN since we started the WAL ingestion. Reasoning: we - // never evict values from the cache, so if the relation size changed after - // 'lsn', the new value is already in the cache. - return; - } - - match rel_size_cache.map.entry(tag) { - hash_map::Entry::Occupied(mut entry) => { - let cached_lsn = entry.get_mut(); - if lsn >= cached_lsn.0 { - *cached_lsn = (lsn, nblocks); + pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) { + let lsn = version.get_lsn(); + if version.is_latest() { + let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap(); + match rel_size_cache.entry(tag) { + hash_map::Entry::Occupied(mut entry) => { + let cached_lsn = entry.get_mut(); + if lsn >= cached_lsn.0 { + *cached_lsn = (lsn, nblocks); + } + } + hash_map::Entry::Vacant(entry) => { + entry.insert((lsn, nblocks)); + RELSIZE_LATEST_CACHE_ENTRIES.inc(); } } - hash_map::Entry::Vacant(entry) => { - entry.insert((lsn, nblocks)); - RELSIZE_CACHE_ENTRIES.inc(); + } else { + let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap(); + if rel_size_cache.capacity() != 0 { + rel_size_cache.insert((lsn, tag), nblocks); + RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64); } } } /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() { - RELSIZE_CACHE_ENTRIES.inc(); + let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap(); + if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() { + RELSIZE_LATEST_CACHE_ENTRIES.inc(); } } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - if rel_size_cache.map.remove(tag).is_some() { - RELSIZE_CACHE_ENTRIES.dec(); + let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap(); + if rel_size_cache.remove(tag).is_some() { + RELSIZE_LATEST_CACHE_ENTRIES.dec(); } } } @@ -1585,7 +1625,10 @@ impl DatadirModification<'_> { // check the cache too. 
This is because eagerly checking the cache results in // less work overall and 10% better performance. It's more work on cache miss // but cache miss is rare. - if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) { + if let Some(nblocks) = self + .tline + .get_cached_rel_size(&rel, Version::Modified(self)) + { Ok(nblocks) } else if !self .tline @@ -2667,7 +2710,7 @@ pub struct DatadirModificationStats { /// timeline to not miss the latest updates. #[derive(Clone, Copy)] pub enum Version<'a> { - Lsn(Lsn), + LsnRange(LsnRange), Modified(&'a DatadirModification<'a>), } @@ -2679,7 +2722,7 @@ impl Version<'_> { ctx: &RequestContext, ) -> Result { match self { - Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await, + Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await, Version::Modified(modification) => modification.get(key, ctx).await, } } @@ -2701,12 +2744,26 @@ impl Version<'_> { } } - fn get_lsn(&self) -> Lsn { + pub fn is_latest(&self) -> bool { match self { - Version::Lsn(lsn) => *lsn, + Version::LsnRange(lsns) => lsns.is_latest(), + Version::Modified(_) => true, + } + } + + pub fn get_lsn(&self) -> Lsn { + match self { + Version::LsnRange(lsns) => lsns.effective_lsn, Version::Modified(modification) => modification.lsn, } } + + pub fn at(lsn: Lsn) -> Self { + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: lsn, + }) + } } //--- Metadata structs stored in key-value pairs in the repository. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d3c92ab47a..da2e56d80a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ pub mod span; pub mod uninit; mod walreceiver; +use hashlink::LruCache; use std::array; use std::cmp::{max, min}; use std::collections::btree_map::Entry; @@ -197,16 +198,6 @@ pub struct TimelineResources { pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } -/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL -/// ingestion considerably, because WAL ingestion needs to check on most records if the record -/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end -/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the -/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. 
-pub(crate) struct RelSizeCache { - pub(crate) complete_as_of: Lsn, - pub(crate) map: HashMap, -} - pub struct Timeline { pub(crate) conf: &'static PageServerConf, tenant_conf: Arc>, @@ -365,7 +356,8 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub(crate) rel_size_cache: RwLock, + pub(crate) rel_size_latest_cache: RwLock>, + pub(crate) rel_size_snapshot_cache: Mutex>, download_all_remote_layers_task_info: RwLock>, @@ -2820,6 +2812,13 @@ impl Timeline { self.remote_client.update_config(&new_conf.location); + let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap(); + if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity { + if new_capacity != rel_size_cache.capacity() { + rel_size_cache.set_capacity(new_capacity); + } + } + self.metrics .evictions_with_low_residence_duration .write() @@ -2878,6 +2877,14 @@ impl Timeline { ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } + let relsize_snapshot_cache_capacity = { + let loaded_tenant_conf = tenant_conf.load(); + loaded_tenant_conf + .tenant_conf + .relsize_snapshot_cache_capacity + .unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity) + }; + Arc::new_cyclic(|myself| { let metrics = Arc::new(TimelineMetrics::new( &tenant_shard_id, @@ -2969,10 +2976,8 @@ impl Timeline { last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(RelSizeCache { - complete_as_of: disk_consistent_lsn, - map: HashMap::new(), - }), + rel_size_latest_cache: RwLock::new(HashMap::new()), + rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)), download_all_remote_layers_task_info: RwLock::new(None), diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e60c590f87..c7a6655052 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1684,31 +1684,31 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await?, false ); assert!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await .is_err() ); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1719,7 +1719,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x20)), + Version::at(Lsn(0x20)), &ctx, io_concurrency.clone() ) @@ -1733,7 +1733,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x30)), + Version::at(Lsn(0x30)), &ctx, io_concurrency.clone() ) @@ -1747,7 +1747,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x40)), + Version::at(Lsn(0x40)), &ctx, io_concurrency.clone() ) @@ -1760,7 +1760,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x40)), + Version::at(Lsn(0x40)), &ctx, io_concurrency.clone() ) @@ -1774,7 +1774,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1787,7 +1787,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1800,7 +1800,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 2, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1820,7 +1820,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx) .await?, 2 ); @@ -1829,7 +1829,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x60)), + Version::at(Lsn(0x60)), &ctx, io_concurrency.clone() ) @@ -1842,7 +1842,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x60)), + Version::at(Lsn(0x60)), &ctx, io_concurrency.clone() ) @@ -1854,7 +1854,7 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1863,7 +1863,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 2, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1880,7 +1880,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1893,7 +1893,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx) .await?, 2 ); @@ -1902,7 +1902,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x70)), + Version::at(Lsn(0x70)), &ctx, io_concurrency.clone() ) @@ -1915,7 +1915,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x70)), + Version::at(Lsn(0x70)), &ctx, io_concurrency.clone() ) @@ -1932,7 +1932,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline 
- .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx) .await?, 1501 ); @@ -1942,7 +1942,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blk, - Version::Lsn(Lsn(0x80)), + Version::at(Lsn(0x80)), &ctx, io_concurrency.clone() ) @@ -1956,7 +1956,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1500, - Version::Lsn(Lsn(0x80)), + Version::at(Lsn(0x80)), &ctx, io_concurrency.clone() ) @@ -1990,13 +1990,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, 1 ); @@ -2011,7 +2011,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx) .await?, false ); @@ -2029,13 +2029,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2077,26 +2077,26 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await?, false ); assert!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await .is_err() ); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2110,7 +2110,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(lsn), + Version::at(lsn), &ctx, io_concurrency.clone() ) @@ -2131,7 +2131,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2144,7 +2144,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(Lsn(0x60)), + Version::at(Lsn(0x60)), &ctx, io_concurrency.clone() ) @@ -2157,7 +2157,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2169,7 +2169,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -2193,13 +2193,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2212,7 +2212,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(Lsn(0x80)), + Version::at(Lsn(0x80)), 
&ctx, io_concurrency.clone() ) @@ -2250,7 +2250,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2264,7 +2264,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2279,7 +2279,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2297,7 +2297,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 3616467c00..3eb6b7193c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, "rel_size_v2_enabled": True, + "relsize_snapshot_cache_capacity": 10000, "gc_compaction_enabled": True, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index e2a22cc769..c88bc7aace 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -27,8 +27,9 @@ from contextlib import closing import psycopg2 import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.neon_fixtures import NeonEnv, PgBin, wait_for_last_flush_lsn, wait_replica_caughtup from fixtures.pg_version import PgVersion from fixtures.utils import query_scalar, skip_on_postgres, wait_until @@ -695,3 +696,110 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): with secondary.cursor() as secondary_cur: secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (n_restarts,) + + +def test_ephemeral_endpoints_vacuum(neon_simple_env: NeonEnv, pg_bin: PgBin): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + sql = """ +CREATE TABLE CHAR_TBL(f1 char(4)); +CREATE TABLE FLOAT8_TBL(f1 float8); +CREATE TABLE INT2_TBL(f1 int2); +CREATE TABLE INT4_TBL(f1 int4); +CREATE TABLE INT8_TBL(q1 int8, q2 int8); +CREATE TABLE POINT_TBL(f1 point); +CREATE TABLE TEXT_TBL (f1 text); +CREATE TABLE VARCHAR_TBL(f1 varchar(4)); +CREATE TABLE onek (unique1 int4); +CREATE TABLE onek2 AS SELECT * FROM onek; +CREATE TABLE tenk1 (unique1 int4); +CREATE TABLE tenk2 AS SELECT * FROM tenk1; +CREATE TABLE person (name text, age int4,location point); +CREATE TABLE emp (salary int4, manager name) INHERITS (person); +CREATE TABLE student (gpa float8) INHERITS (person); +CREATE TABLE stud_emp ( percent int4) INHERITS (emp, student); +CREATE TABLE road (name text,thepath path); +CREATE TABLE ihighway () INHERITS (road); +CREATE TABLE shighway(surface text) INHERITS (road); +CREATE TABLE BOOLTBL3 (d text, b bool, o int); +CREATE TABLE booltbl4(isfalse bool, istrue bool, isnul bool); +DROP TABLE BOOLTBL3; +DROP TABLE BOOLTBL4; 
+CREATE TABLE ceil_floor_round (a numeric); +DROP TABLE ceil_floor_round; +CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8); +DROP TABLE width_bucket_test; +CREATE TABLE num_input_test (n1 numeric); +CREATE TABLE num_variance (a numeric); +INSERT INTO num_variance VALUES (0); +CREATE TABLE snapshot_test (nr integer, snap txid_snapshot); +CREATE TABLE guid1(guid_field UUID, text_field TEXT DEFAULT(now())); +CREATE TABLE guid2(guid_field UUID, text_field TEXT DEFAULT(now())); +CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); +CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); +TRUNCATE guid1; +DROP TABLE guid1; +DROP TABLE guid2 CASCADE; +CREATE TABLE numrange_test (nr NUMRANGE); +CREATE INDEX numrange_test_btree on numrange_test(nr); +CREATE TABLE numrange_test2(nr numrange); +CREATE INDEX numrange_test2_hash_idx on numrange_test2 using hash (nr); +INSERT INTO numrange_test2 VALUES('[, 5)'); +CREATE TABLE textrange_test (tr text); +CREATE INDEX textrange_test_btree on textrange_test(tr); +CREATE TABLE test_range_gist(ir int4range); +CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir); +DROP INDEX test_range_gist_idx; +CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir); +CREATE TABLE test_range_spgist(ir int4range); +CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir); +DROP INDEX test_range_spgist_idx; +CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir); +CREATE TABLE test_range_elem(i int4); +CREATE INDEX test_range_elem_idx on test_range_elem (i); +CREATE INDEX ON test_range_elem using spgist(int4range(i,i+10)); +DROP TABLE test_range_elem; +CREATE TABLE test_range_excl(room int4range, speaker int4range, during tsrange, exclude using gist (room with =, during with &&), exclude using gist (speaker with =, during with &&)); +CREATE TABLE f_test(f text, i int); +CREATE TABLE i8r_array (f1 int, f2 text); +CREATE TYPE arrayrange as range (subtype=int4[]); +CREATE TYPE two_ints as (a int, b int); +DROP TYPE two_ints cascade; +CREATE TABLE text_support_test (t text); +CREATE TABLE TEMP_FLOAT (f1 FLOAT8); +CREATE TABLE TEMP_INT4 (f1 INT4); +CREATE TABLE TEMP_INT2 (f1 INT2); +CREATE TABLE TEMP_GROUP (f1 INT4, f2 INT4, f3 FLOAT8); +CREATE TABLE POLYGON_TBL(f1 polygon); +CREATE TABLE quad_poly_tbl (id int, p polygon); +INSERT INTO quad_poly_tbl SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10)) FROM generate_series(1, 200) x, generate_series(1, 100) y; +CREATE TABLE quad_poly_tbl_ord_seq2 AS SELECT 1 FROM quad_poly_tbl; +CREATE TABLE quad_poly_tbl_ord_idx2 AS SELECT 1 FROM quad_poly_tbl; +""" + + with endpoint.cursor() as cur: + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + env.endpoints.create_start(branch_name="main", lsn=lsn) + log.info(f"lsn: {lsn}") + + for line in sql.split("\n"): + if len(line.strip()) == 0 or line.startswith("--"): + continue + cur.execute(line) + + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + env.endpoints.create_start(branch_name="main", lsn=lsn) + log.info(f"lsn: {lsn}") + + cur.execute("VACUUM FULL pg_class;") + + for ep in env.endpoints.endpoints: + log.info(f"{ep.endpoint_id} / {ep.pg_port}") + pg_dump_command = ["pg_dumpall", "-f", f"/tmp/dump-{ep.endpoint_id}.sql"] + env_vars = { + "PGPORT": str(ep.pg_port), + "PGUSER": endpoint.default_options["user"], + "PGHOST": endpoint.default_options["host"], + } + pg_bin.run_capture(pg_dump_command, env=env_vars) From 
f3c9d0adf437f4d8ce2de3a933cda0fba7bb3cc9 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 20 May 2025 19:57:59 +0200 Subject: [PATCH 125/142] proxy(logging): significant changes to json logging internals for performance. (#11974) #11962 Please review each commit separately. Each commit is rather small in goal. The overall goal of this PR is to keep the behaviour identical, but shave away small inefficiencies here and there. --- proxy/src/logging.rs | 484 +++++++++++++++++++++---------------------- 1 file changed, 235 insertions(+), 249 deletions(-) diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index efa3c0b514..a58b55a704 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,13 +1,11 @@ -use std::cell::{Cell, RefCell}; +use std::cell::RefCell; use std::collections::HashMap; -use std::hash::BuildHasher; +use std::sync::Arc; use std::sync::atomic::{AtomicU32, Ordering}; -use std::{array, env, fmt, io}; +use std::{env, io}; use chrono::{DateTime, Utc}; -use indexmap::IndexSet; use opentelemetry::trace::TraceContextExt; -use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; use tracing::subscriber::Interest; use tracing::{Event, Metadata, Span, Subscriber, callsite, span}; @@ -19,7 +17,6 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; use tracing_subscriber::registry::{LookupSpan, SpanRef}; -use try_lock::TryLock; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -55,7 +52,7 @@ pub async fn init() -> anyhow::Result { StderrWriter { stderr: std::io::stderr(), }, - ["request_id", "session_id", "conn_id"], + &["request_id", "session_id", "conn_id"], )) } else { None @@ -183,50 +180,65 @@ impl Clock for RealClock { /// Name of the field used by tracing crate to store the event message. const MESSAGE_FIELD: &str = "message"; +/// Tracing used to enforce that spans/events have no more than 32 fields. +/// It seems this is no longer the case, but it's still documented in some places. +/// Generally, we shouldn't expect more than 32 fields anyway, so we can try and +/// rely on it for some (minor) performance gains. +const MAX_TRACING_FIELDS: usize = 32; + thread_local! { - /// Protects against deadlocks and double panics during log writing. - /// The current panic handler will use tracing to log panic information. - static REENTRANCY_GUARD: Cell = const { Cell::new(false) }; /// Thread-local instance with per-thread buffer for log writing. - static EVENT_FORMATTER: RefCell = RefCell::new(EventFormatter::new()); + static EVENT_FORMATTER: RefCell = const { RefCell::new(EventFormatter::new()) }; /// Cached OS thread ID. static THREAD_ID: u64 = gettid::gettid(); } +/// Map for values fixed at callsite registration. +// We use papaya here because registration rarely happens post-startup. +// papaya is good for read-heavy workloads. +// +// We use rustc_hash here because callsite::Identifier will always be an integer with low-bit entropy, +// since it's always a pointer to static mutable data. rustc_hash was designed for low-bit entropy. +type CallsiteMap = + papaya::HashMap>; + /// Implements tracing layer to handle events specific to logging. -struct JsonLoggingLayer { +struct JsonLoggingLayer { clock: C, - skipped_field_indices: papaya::HashMap, - callsite_ids: papaya::HashMap, writer: W, - // We use a const generic and arrays to bypass one heap allocation. 
- extract_fields: IndexSet<&'static str>, - _marker: std::marker::PhantomData<[&'static str; F]>, + + /// tracks which fields of each **event** are duplicates + skipped_field_indices: CallsiteMap, + + span_info: CallsiteMap, + + /// Fields we want to keep track of in a separate json object. + extract_fields: &'static [&'static str], } -impl JsonLoggingLayer { - fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self { +impl JsonLoggingLayer { + fn new(clock: C, writer: W, extract_fields: &'static [&'static str]) -> Self { JsonLoggingLayer { clock, - skipped_field_indices: papaya::HashMap::default(), - callsite_ids: papaya::HashMap::default(), + skipped_field_indices: CallsiteMap::default(), + span_info: CallsiteMap::default(), writer, - extract_fields: IndexSet::from_iter(extract_fields), - _marker: std::marker::PhantomData, + extract_fields, } } #[inline] - fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId { - *self - .callsite_ids + fn span_info(&self, metadata: &'static Metadata<'static>) -> CallsiteSpanInfo { + self.span_info .pin() - .get_or_insert_with(cs, CallsiteId::next) + .get_or_insert_with(metadata.callsite(), || { + CallsiteSpanInfo::new(metadata, self.extract_fields) + }) + .clone() } } -impl Layer - for JsonLoggingLayer +impl Layer for JsonLoggingLayer where S: Subscriber + for<'a> LookupSpan<'a>, { @@ -237,35 +249,25 @@ where // early, before OTel machinery, and add as event extension. let now = self.clock.now(); - let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { - if entered.get() { - let mut formatter = EventFormatter::new(); - formatter.format::( - now, - event, - &ctx, - &self.skipped_field_indices, - &self.callsite_ids, - &self.extract_fields, - )?; - self.writer.make_writer().write_all(formatter.buffer()) - } else { - entered.set(true); - defer!(entered.set(false);); + let res: io::Result<()> = EVENT_FORMATTER.with(|f| { + let mut borrow = f.try_borrow_mut(); + let formatter = match borrow.as_deref_mut() { + Ok(formatter) => formatter, + // If the thread local formatter is borrowed, + // then we likely hit an edge case were we panicked during formatting. + // We allow the logging to proceed with an uncached formatter. + Err(_) => &mut EventFormatter::new(), + }; - EVENT_FORMATTER.with_borrow_mut(move |formatter| { - formatter.reset(); - formatter.format::( - now, - event, - &ctx, - &self.skipped_field_indices, - &self.callsite_ids, - &self.extract_fields, - )?; - self.writer.make_writer().write_all(formatter.buffer()) - }) - } + formatter.reset(); + formatter.format( + now, + event, + &ctx, + &self.skipped_field_indices, + self.extract_fields, + )?; + self.writer.make_writer().write_all(formatter.buffer()) }); // In case logging fails we generate a simpler JSON object. @@ -287,50 +289,48 @@ where /// Registers a SpanFields instance as span extension. fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { let span = ctx.span(id).expect("span must exist"); - let fields = SpanFields::default(); - fields.record_fields(attrs); - // This could deadlock when there's a panic somewhere in the tracing - // event handling and a read or write guard is still held. This includes - // the OTel subscriber. - let mut exts = span.extensions_mut(); + let mut fields = SpanFields::new(self.span_info(span.metadata())); + attrs.record(&mut fields); - exts.insert(fields); + // This is a new span: the extensions should not be locked + // unless some layer spawned a thread to process this span. 
+ // I don't think any layers do that. + span.extensions_mut().insert(fields); } fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { let span = ctx.span(id).expect("span must exist"); - let ext = span.extensions(); - if let Some(data) = ext.get::() { - data.record_fields(values); + + // assumption: `on_record` is rarely called. + // assumption: a span being updated by one thread, + // and formatted by another thread is even rarer. + let mut ext = span.extensions_mut(); + if let Some(fields) = ext.get_mut::() { + values.record(fields); } } - /// Called (lazily) whenever a new log call is executed. We quickly check - /// for duplicate field names and record duplicates as skippable. Last one - /// wins. + /// Called (lazily) roughly once per event/span instance. We quickly check + /// for duplicate field names and record duplicates as skippable. Last field wins. fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { + debug_assert!( + metadata.fields().len() <= MAX_TRACING_FIELDS, + "callsite {metadata:?} has too many fields." + ); + if !metadata.is_event() { - self.callsite_id(metadata.callsite()); + // register the span info. + self.span_info(metadata); // Must not be never because we wouldn't get trace and span data. return Interest::always(); } let mut field_indices = SkippedFieldIndices::default(); - let mut seen_fields = HashMap::<&'static str, usize>::new(); + let mut seen_fields = HashMap::new(); for field in metadata.fields() { - use std::collections::hash_map::Entry; - match seen_fields.entry(field.name()) { - Entry::Vacant(entry) => { - // field not seen yet - entry.insert(field.index()); - } - Entry::Occupied(mut entry) => { - // replace currently stored index - let old_index = entry.insert(field.index()); - // ... and append it to list of skippable indices - field_indices.push(old_index); - } + if let Some(old_index) = seen_fields.insert(field.name(), field.index()) { + field_indices.set(old_index); } } @@ -344,110 +344,113 @@ where } } -#[derive(Copy, Clone, Debug, Default)] -#[repr(transparent)] -struct CallsiteId(u32); +/// Any span info that is fixed to a particular callsite. Not variable between span instances. +#[derive(Clone)] +struct CallsiteSpanInfo { + /// index of each field to extract. usize::MAX if not found. + extract: Arc<[usize]>, -impl CallsiteId { - #[inline] - fn next() -> Self { - // Start at 1 to reserve 0 for default. - static COUNTER: AtomicU32 = AtomicU32::new(1); - CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed)) - } + /// tracks the fixed "callsite ID" for each span. + /// note: this is not stable between runs. + normalized_name: Arc, } -impl fmt::Display for CallsiteId { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) +impl CallsiteSpanInfo { + fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self { + // Start at 1 to reserve 0 for default. + static COUNTER: AtomicU32 = AtomicU32::new(1); + + let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect(); + + // get all the indices of span fields we want to focus + let extract = extract_fields + .iter() + // use rposition, since we want last match wins. + .map(|f1| names.iter().rposition(|f2| f1 == f2).unwrap_or(usize::MAX)) + .collect(); + + // normalized_name is unique for each callsite, but it is not + // unified across separate proxy instances. + // todo: can we do better here? 
+ let cid = COUNTER.fetch_add(1, Ordering::Relaxed); + let normalized_name = format!("{}#{cid}", metadata.name()).into(); + + Self { + extract, + normalized_name, + } } } /// Stores span field values recorded during the spans lifetime. -#[derive(Default)] struct SpanFields { - // TODO: Switch to custom enum with lasso::Spur for Strings? - fields: papaya::HashMap<&'static str, serde_json::Value>, + values: [serde_json::Value; MAX_TRACING_FIELDS], + + /// cached span info so we can avoid extra hashmap lookups in the hot path. + span_info: CallsiteSpanInfo, } impl SpanFields { - #[inline] - fn record_fields(&self, fields: R) { - fields.record(&mut SpanFieldsRecorder { - fields: self.fields.pin(), - }); + fn new(span_info: CallsiteSpanInfo) -> Self { + Self { + span_info, + values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS], + } } } -/// Implements a tracing field visitor to convert and store values. -struct SpanFieldsRecorder<'m, S, G> { - fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>, -} - -impl tracing::field::Visit for SpanFieldsRecorder<'_, S, G> { +impl tracing::field::Visit for SpanFields { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { if let Ok(value) = i64::try_from(value) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } else { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value}"))); + self.values[field.index()] = serde_json::Value::from(format!("{value}")); } } #[inline] fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { if let Ok(value) = u64::try_from(value) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } else { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value}"))); + self.values[field.index()] = serde_json::Value::from(format!("{value}")); } } #[inline] fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_str(&mut self, field: &tracing::field::Field, value: &str) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value:?}"))); + self.values[field.index()] = 
serde_json::Value::from(format!("{value:?}")); } #[inline] @@ -456,38 +459,33 @@ impl tracing::field::Visit for SpanFieldsRecor field: &tracing::field::Field, value: &(dyn std::error::Error + 'static), ) { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value}"))); + self.values[field.index()] = serde_json::Value::from(format!("{value}")); } } /// List of field indices skipped during logging. Can list duplicate fields or /// metafields not meant to be logged. -#[derive(Clone, Default)] +#[derive(Copy, Clone, Default)] struct SkippedFieldIndices { - bits: u64, + // 32-bits is large enough for `MAX_TRACING_FIELDS` + bits: u32, } impl SkippedFieldIndices { #[inline] - fn is_empty(&self) -> bool { + fn is_empty(self) -> bool { self.bits == 0 } #[inline] - fn push(&mut self, index: usize) { - self.bits |= 1u64 - .checked_shl(index as u32) - .expect("field index too large"); + fn set(&mut self, index: usize) { + debug_assert!(index <= 32, "index out of bounds of 32-bit set"); + self.bits |= 1 << index; } #[inline] - fn contains(&self, index: usize) -> bool { - self.bits - & 1u64 - .checked_shl(index as u32) - .expect("field index too large") - != 0 + fn contains(self, index: usize) -> bool { + self.bits & (1 << index) != 0 } } @@ -499,7 +497,7 @@ struct EventFormatter { impl EventFormatter { #[inline] - fn new() -> Self { + const fn new() -> Self { EventFormatter { logline_buffer: Vec::new(), } @@ -515,14 +513,13 @@ impl EventFormatter { self.logline_buffer.clear(); } - fn format( + fn format( &mut self, now: DateTime, event: &Event<'_>, ctx: &Context<'_, S>, - skipped_field_indices: &papaya::HashMap, - callsite_ids: &papaya::HashMap, - extract_fields: &IndexSet<&'static str>, + skipped_field_indices: &CallsiteMap, + extract_fields: &'static [&'static str], ) -> io::Result<()> where S: Subscriber + for<'a> LookupSpan<'a>, @@ -533,8 +530,11 @@ impl EventFormatter { let normalized_meta = event.normalized_metadata(); let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); - let skipped_field_indices = skipped_field_indices.pin(); - let skipped_field_indices = skipped_field_indices.get(&meta.callsite()); + let skipped_field_indices = skipped_field_indices + .pin() + .get(&meta.callsite()) + .copied() + .unwrap_or_default(); let mut serialize = || { let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer); @@ -565,9 +565,11 @@ impl EventFormatter { } let spans = SerializableSpans { - ctx, - callsite_ids, - extract: ExtractedSpanFields::<'_, F>::new(extract_fields), + // collect all spans from parent to root. + spans: ctx + .event_span(event) + .map_or(vec![], |parent| parent.scope().collect()), + extracted: ExtractedSpanFields::new(extract_fields), }; serializer.serialize_entry("spans", &spans)?; @@ -620,9 +622,9 @@ impl EventFormatter { } } - if spans.extract.has_values() { + if spans.extracted.has_values() { // TODO: add fields from event, too? - serializer.serialize_entry("extract", &spans.extract)?; + serializer.serialize_entry("extract", &spans.extracted)?; } serializer.end() @@ -635,15 +637,15 @@ impl EventFormatter { } /// Extracts the message field that's mixed will other fields. 
-struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> { +struct MessageFieldExtractor { serializer: S, - skipped_field_indices: Option<&'a SkippedFieldIndices>, + skipped_field_indices: SkippedFieldIndices, state: Option>, } -impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { +impl MessageFieldExtractor { #[inline] - fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self { Self { serializer, skipped_field_indices, @@ -665,13 +667,11 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { fn accept_field(&self, field: &tracing::field::Field) -> bool { self.state.is_none() && field.name() == MESSAGE_FIELD - && !self - .skipped_field_indices - .is_some_and(|i| i.contains(field.index())) + && !self.skipped_field_indices.contains(field.index()) } } -impl tracing::field::Visit for MessageFieldExtractor<'_, S> { +impl tracing::field::Visit for MessageFieldExtractor { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { if self.accept_field(field) { @@ -751,14 +751,14 @@ impl tracing::field::Visit for MessageFieldExtracto /// can be skipped. // This is entirely optional and only cosmetic, though maybe helps a // bit during log parsing in dashboards when there's no field with empty object. -struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>); +struct FieldsPresent(pub bool, SkippedFieldIndices); // Even though some methods have an overhead (error, bytes) it is assumed the // compiler won't include this since we ignore the value entirely. -impl tracing::field::Visit for FieldsPresent<'_> { +impl tracing::field::Visit for FieldsPresent { #[inline] fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) { - if !self.1.is_some_and(|i| i.contains(field.index())) + if !self.1.contains(field.index()) && field.name() != MESSAGE_FIELD && !field.name().starts_with("log.") { @@ -768,10 +768,7 @@ impl tracing::field::Visit for FieldsPresent<'_> { } /// Serializes the fields directly supplied with a log event. -struct SerializableEventFields<'a, 'event>( - &'a tracing::Event<'event>, - Option<&'a SkippedFieldIndices>, -); +struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices); impl serde::ser::Serialize for SerializableEventFields<'_, '_> { fn serialize(&self, serializer: S) -> Result @@ -788,15 +785,15 @@ impl serde::ser::Serialize for SerializableEventFields<'_, '_> { } /// A tracing field visitor that skips the message field. 
-struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> { +struct MessageFieldSkipper { serializer: S, - skipped_field_indices: Option<&'a SkippedFieldIndices>, + skipped_field_indices: SkippedFieldIndices, state: Result<(), S::Error>, } -impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { +impl MessageFieldSkipper { #[inline] - fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self { Self { serializer, skipped_field_indices, @@ -809,9 +806,7 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { self.state.is_ok() && field.name() != MESSAGE_FIELD && !field.name().starts_with("log.") - && !self - .skipped_field_indices - .is_some_and(|i| i.contains(field.index())) + && !self.skipped_field_indices.contains(field.index()) } #[inline] @@ -821,7 +816,7 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { } } -impl tracing::field::Visit for MessageFieldSkipper<'_, S> { +impl tracing::field::Visit for MessageFieldSkipper { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { if self.accept_field(field) { @@ -905,18 +900,17 @@ impl tracing::field::Visit for MessageFieldSkipper< /// with the span names as keys. To prevent collision we append a numberic value /// to the name. Also, collects any span fields we're interested in. Last one /// wins. -struct SerializableSpans<'a, 'ctx, Span, const F: usize> +struct SerializableSpans<'ctx, S> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>, + S: for<'lookup> LookupSpan<'lookup>, { - ctx: &'a Context<'ctx, Span>, - callsite_ids: &'a papaya::HashMap, - extract: ExtractedSpanFields<'a, F>, + spans: Vec>, + extracted: ExtractedSpanFields, } -impl serde::ser::Serialize for SerializableSpans<'_, '_, Span, F> +impl serde::ser::Serialize for SerializableSpans<'_, S> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>, + S: for<'lookup> LookupSpan<'lookup>, { fn serialize(&self, serializer: Ser) -> Result where @@ -924,25 +918,22 @@ where { let mut serializer = serializer.serialize_map(None)?; - if let Some(leaf_span) = self.ctx.lookup_current() { - for span in leaf_span.scope().from_root() { - // Append a numeric callsite ID to the span name to keep the name unique - // in the JSON object. - let cid = self - .callsite_ids - .pin() - .get(&span.metadata().callsite()) - .copied() - .unwrap_or_default(); + for span in self.spans.iter().rev() { + let ext = span.extensions(); - // Loki turns the # into an underscore during field name concatenation. - serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?; + // all spans should have this extension. + let Some(fields) = ext.get() else { continue }; - serializer.serialize_value(&SerializableSpanFields { - span: &span, - extract: &self.extract, - })?; - } + self.extracted.layer_span(fields); + + let SpanFields { values, span_info } = fields; + serializer.serialize_entry( + &*span_info.normalized_name, + &SerializableSpanFields { + fields: span.metadata().fields(), + values, + }, + )?; } serializer.end() @@ -950,80 +941,77 @@ where } /// Serializes the span fields as object. 
-struct SerializableSpanFields<'a, 'span, Span, const F: usize> -where - Span: for<'lookup> LookupSpan<'lookup>, -{ - span: &'a SpanRef<'span, Span>, - extract: &'a ExtractedSpanFields<'a, F>, +struct SerializableSpanFields<'span> { + fields: &'span tracing::field::FieldSet, + values: &'span [serde_json::Value; MAX_TRACING_FIELDS], } -impl serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F> -where - Span: for<'lookup> LookupSpan<'lookup>, -{ +impl serde::ser::Serialize for SerializableSpanFields<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - let ext = self.span.extensions(); - if let Some(data) = ext.get::() { - for (name, value) in &data.fields.pin() { - serializer.serialize_entry(name, value)?; - // TODO: replace clone with reference, if possible. - self.extract.set(name, value.clone()); + for (field, value) in std::iter::zip(self.fields, self.values) { + if value.is_null() { + continue; } + serializer.serialize_entry(field.name(), value)?; } serializer.end() } } -struct ExtractedSpanFields<'a, const F: usize> { - names: &'a IndexSet<&'static str>, - // TODO: replace TryLock with something local thread and interior mutability. - // serde API doesn't let us use `mut`. - values: TryLock<([Option; F], bool)>, +struct ExtractedSpanFields { + names: &'static [&'static str], + values: RefCell>, } -impl<'a, const F: usize> ExtractedSpanFields<'a, F> { - fn new(names: &'a IndexSet<&'static str>) -> Self { +impl ExtractedSpanFields { + fn new(names: &'static [&'static str]) -> Self { ExtractedSpanFields { names, - values: TryLock::new((array::from_fn(|_| Option::default()), false)), + values: RefCell::new(vec![serde_json::Value::Null; names.len()]), } } - #[inline] - fn set(&self, name: &'static str, value: serde_json::Value) { - if let Some((index, _)) = self.names.get_full(name) { - let mut fields = self.values.try_lock().expect("thread-local use"); - fields.0[index] = Some(value); - fields.1 = true; + fn layer_span(&self, fields: &SpanFields) { + let mut v = self.values.borrow_mut(); + let SpanFields { values, span_info } = fields; + + // extract the fields + for (i, &j) in span_info.extract.iter().enumerate() { + let Some(value) = values.get(j) else { continue }; + + if !value.is_null() { + // TODO: replace clone with reference, if possible. 
+ v[i] = value.clone(); + } } } #[inline] fn has_values(&self) -> bool { - self.values.try_lock().expect("thread-local use").1 + self.values.borrow().iter().any(|v| !v.is_null()) } } -impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { +impl serde::ser::Serialize for ExtractedSpanFields { fn serialize(&self, serializer: S) -> Result where S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - let values = self.values.try_lock().expect("thread-local use"); - for (i, value) in values.0.iter().enumerate() { - if let Some(value) = value { - let key = self.names[i]; - serializer.serialize_entry(key, value)?; + let values = self.values.borrow(); + for (key, value) in std::iter::zip(self.names, &*values) { + if value.is_null() { + continue; } + + serializer.serialize_entry(key, value)?; } serializer.end() @@ -1032,7 +1020,6 @@ impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { #[cfg(test)] mod tests { - use std::marker::PhantomData; use std::sync::{Arc, Mutex, MutexGuard}; use assert_json_diff::assert_json_eq; @@ -1081,10 +1068,9 @@ mod tests { let log_layer = JsonLoggingLayer { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), - callsite_ids: papaya::HashMap::default(), + span_info: papaya::HashMap::default(), writer: buffer.clone(), - extract_fields: IndexSet::from_iter(["x"]), - _marker: PhantomData::<[&'static str; 1]>, + extract_fields: &["x"], }; let registry = tracing_subscriber::Registry::default().with(log_layer); From a2b756843ed50ae927d5300cd36c426deb5fb7f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 May 2025 23:00:49 +0000 Subject: [PATCH 126/142] chore(deps): bump setuptools from 70.0.0 to 78.1.1 in the pip group across 1 directory (#11977) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index e6440761be..21a2664555 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3170,19 +3170,24 @@ pbr = "*" [[package]] name = "setuptools" -version = "70.0.0" +version = "78.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, - {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, + {file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"}, + {file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip 
(>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] +core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "six" From 6f4f3691a55ac4e7599c8a7e44539a266b339be9 Mon Sep 17 00:00:00 2001 From: Alexander Sarantcev <99037063+ephemeralsad@users.noreply.github.com> Date: Wed, 21 May 2025 13:03:26 +0400 Subject: [PATCH 127/142] pageserver: Add tracing endpoint correctness check in config validation (#11970) ## Problem When using an incorrect endpoint string - `"localhost:4317"`, it's a runtime error, but it can be a config error - Closes: https://github.com/neondatabase/neon/issues/11394 ## Summary of changes Add config parse time check via `request::Url::parse` validation. 
--------- Co-authored-by: Aleksandr Sarantsev --- pageserver/src/config.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 7e773f56b3..62f5b009f7 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -544,6 +544,23 @@ impl PageServerConf { ratio.numerator, ratio.denominator ) ); + + let url = Url::parse(&tracing_config.export_config.endpoint) + .map_err(anyhow::Error::msg) + .with_context(|| { + format!( + "tracing endpoint URL is invalid : {}", + tracing_config.export_config.endpoint + ) + })?; + + ensure!( + url.scheme() == "http" || url.scheme() == "https", + format!( + "tracing endpoint URL must start with http:// or https://: {}", + tracing_config.export_config.endpoint + ) + ); } IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) @@ -660,4 +677,25 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect("parse_and_validate"); } + + #[test] + fn test_config_tracing_endpoint_is_invalid() { + let input = r#" + control_plane_api = "http://localhost:6666" + + [tracing] + + sampling_ratio = { numerator = 1, denominator = 0 } + + [tracing.export_config] + endpoint = "localhost:4317" + protocol = "http-binary" + timeout = "1ms" + "#; + let config_toml = toml_edit::de::from_str::(input) + .expect("config has valid fields"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect_err("parse_and_validate should fail for endpoint without scheme"); + } } From 08bb72e516c862bb3cc1f81443a461496917a30e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 21 May 2025 12:08:49 +0100 Subject: [PATCH 128/142] pageserver: allow in-mem reads to be planned during writes (#11937) ## Problem Get page tracing revealed situations where planning an in-memory layer is taking around 150ms. Upon investigation, the culprit is the inner in-mem layer file lock. A batch being written holds the write lock and a read being planned wants the read lock. See [this trace](https://neonprod.grafana.net/explore?schemaVersion=1&panes=%7B%22j61%22:%7B%22datasource%22:%22JMfY_5TVz%22,%22queries%22:%5B%7B%22refId%22:%22traceId%22,%22queryType%22:%22traceql%22,%22query%22:%22412ec4522fe1750798aca54aec2680ac%22,%22datasource%22:%7B%22type%22:%22tempo%22,%22uid%22:%22JMfY_5TVz%22%7D,%22limit%22:20,%22tableType%22:%22traces%22,%22metricsQueryType%22:%22range%22%7D%5D,%22range%22:%7B%22to%22:%221746702606349%22,%22from%22:%221746681006349%22%7D,%22panelsState%22:%7B%22trace%22:%7B%22spanId%22:%2291e9f1879c9bccc0%22%7D%7D%7D,%226d0%22:%7B%22datasource%22:%22JMfY_5TVz%22,%22queries%22:%5B%7B%22refId%22:%22traceId%22,%22queryType%22:%22traceql%22,%22query%22:%2220a4757706b16af0e1fbab83f9d2e925%22,%22datasource%22:%7B%22type%22:%22tempo%22,%22uid%22:%22JMfY_5TVz%22%7D,%22limit%22:20,%22tableType%22:%22traces%22,%22metricsQueryType%22:%22range%22%7D%5D,%22range%22:%7B%22to%22:%221746702614807%22,%22from%22:%221746681014807%22%7D,%22panelsState%22:%7B%22trace%22:%7B%22spanId%22:%2260e7825512bc2a6b%22%7D%7D%7D%7D) for example. ## Summary of changes Lift the index into its own RwLock such that we can at least plan during write IO. I tried to be smarter in https://github.com/neondatabase/neon/pull/11866: arc swap + structurally shared datastructure and that killed ingest perf for small keys. 
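To make the intent concrete, here is a simplified sketch of the split-lock idea with stand-in types (the real struct in the diff below stores an ephemeral file plus a `BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>`); this is just the shape of the change, not the actual implementation:

```rust
use std::collections::BTreeMap;
use tokio::sync::RwLock;

struct LayerSketch {
    // Lock order when both are needed: `inner` first, then `index`.
    inner: RwLock<Vec<u8>>,            // stand-in for the append-only value file
    index: RwLock<BTreeMap<u64, u64>>, // key -> offset into the file
}

impl LayerSketch {
    async fn put(&self, key: u64, value: &[u8]) {
        // Hold the file lock only for the (potentially slow) append...
        let offset = {
            let mut file = self.inner.write().await;
            let offset = file.len() as u64;
            file.extend_from_slice(value);
            offset
        };
        // ...then take the index lock briefly to publish the entry.
        self.index.write().await.insert(key, offset);
    }

    async fn plan_read(&self, key: u64) -> Option<u64> {
        // Read planning only needs the index, so it no longer queues
        // behind a batch that is in the middle of writing to the file.
        self.index.read().await.get(&key).copied()
    }
}
```

Because the file is append-only, publishing the index entry after the write (and reading the index before the file) stays consistent without ever holding both locks at once.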
## Benchmarking * No statistically significant difference for rust inget benchmarks when compared to main. --- .../tenant/storage_layer/inmemory_layer.rs | 128 +++++++++++------- 1 file changed, 76 insertions(+), 52 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5d558e66cc..200beba115 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -63,7 +63,28 @@ pub struct InMemoryLayer { opened_at: Instant, - /// The above fields never change, except for `end_lsn`, which is only set once. + /// All versions of all pages in the layer are kept here. Indexed + /// by block number and LSN. The [`IndexEntry`] is an offset into the + /// ephemeral file where the page version is stored. + /// + /// We use a separate lock for the index to reduce the critical section + /// during which reads cannot be planned. + /// + /// If you need access to both the index and the underlying file at the same time, + /// respect the following locking order to avoid deadlocks: + /// 1. [`InMemoryLayer::inner`] + /// 2. [`InMemoryLayer::index`] + /// + /// Note that the file backing [`InMemoryLayer::inner`] is append-only, + /// so it is not necessary to hold simultaneous locks on index. + /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency. + /// In particular: + /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`]. + /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`]. + index: RwLock>>, + + /// The above fields never change, except for `end_lsn`, which is only set once, + /// and `index` (see rationale there). /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -81,11 +102,6 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed - /// by block number and LSN. The [`IndexEntry`] is an offset into the - /// ephemeral file where the page version is stored. - index: BTreeMap>, - /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. @@ -105,7 +121,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { trailing_ones }; -/// See [`InMemoryLayerInner::index`]. +/// See [`InMemoryLayer::index`]. /// /// For memory efficiency, the data is packed into a u64. 
/// @@ -425,7 +441,7 @@ impl InMemoryLayer { .page_content_kind(PageContentKind::InMemoryLayer) .attached_child(); - let inner = self.inner.read().await; + let index = self.index.read().await; struct ValueRead { entry_lsn: Lsn, @@ -435,10 +451,7 @@ impl InMemoryLayer { let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); for range in keyspace.ranges.iter() { - for (key, vec_map) in inner - .index - .range(range.start.to_compact()..range.end.to_compact()) - { + for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) { let key = Key::from_compact(*key); let slice = vec_map.slice_range(lsn_range.clone()); @@ -466,7 +479,7 @@ impl InMemoryLayer { } } } - drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below let read_from = Arc::clone(self); let read_ctx = ctx.attached_child(); reconstruct_state @@ -573,8 +586,8 @@ impl InMemoryLayer { start_lsn, end_lsn: OnceLock::new(), opened_at: Instant::now(), + index: RwLock::new(BTreeMap::new()), inner: RwLock::new(InMemoryLayerInner { - index: BTreeMap::new(), file, resource_units: GlobalResourceUnits::new(), }), @@ -592,31 +605,39 @@ impl InMemoryLayer { serialized_batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); + let (base_offset, metadata) = { + let mut inner = self.inner.write().await; + self.assert_writable(); - let base_offset = inner.file.len(); + let base_offset = inner.file.len(); - let SerializedValueBatch { - raw, - metadata, - max_lsn: _, - len: _, - } = serialized_batch; + let SerializedValueBatch { + raw, + metadata, + max_lsn: _, + len: _, + } = serialized_batch; - // Write the batch to the file - inner.file.write_raw(&raw, ctx).await?; - let new_size = inner.file.len(); + // Write the batch to the file + inner.file.write_raw(&raw, ctx).await?; + let new_size = inner.file.len(); - let expected_new_len = base_offset - .checked_add(raw.len().into_u64()) - // write_raw would error if we were to overflow u64. - // also IndexEntry and higher levels in - //the code don't allow the file to grow that large - .unwrap(); - assert_eq!(new_size, expected_new_len); + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. + // also IndexEntry and higher levels in + //the code don't allow the file to grow that large + .unwrap(); + assert_eq!(new_size, expected_new_len); + + inner.resource_units.maybe_publish_size(new_size); + + (base_offset, metadata) + }; // Update the index with the new entries + let mut index = self.index.write().await; + for meta in metadata { let SerializedValueMeta { key, @@ -639,7 +660,7 @@ impl InMemoryLayer { will_init, })?; - let vec_map = inner.index.entry(key).or_default(); + let vec_map = index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; if old.is_some() { // This should not break anything, but is unexpected: ingestion code aims to filter out @@ -658,8 +679,6 @@ impl InMemoryLayer { ); } - inner.resource_units.maybe_publish_size(new_size); - Ok(()) } @@ -680,6 +699,18 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. 
/// `end_lsn` is exclusive + /// + /// A note on locking: + /// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing + /// writes while freezing the layer. This is enforced at a higher level via + /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths: + /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the + /// Timeline::write_lock for its lifetime. The rolling is handled in + /// [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function + /// so can't be called from different threads. + /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`]. + /// This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer), + /// hence there can be no concurrent writes pub async fn freeze(&self, end_lsn: Lsn) { assert!( self.start_lsn < end_lsn, @@ -700,8 +731,8 @@ impl InMemoryLayer { #[cfg(debug_assertions)] { - let inner = self.inner.write().await; - for vec_map in inner.index.values() { + let index = self.index.read().await; + for vec_map in index.values() { for (lsn, _) in vec_map.as_slice() { assert!(*lsn < end_lsn); } @@ -724,14 +755,11 @@ impl InMemoryLayer { ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the - // write lock on it, so we shouldn't block anyone. There's one exception - // though: another thread might have grabbed a reference to this layer - // in `get_layer_for_write' just before the checkpointer called - // `freeze`, and then `write_to_disk` on it. When the thread gets the - // lock, it will see that it's not writeable anymore and retry, but it - // would have to wait until we release it. That race condition is very - // rare though, so we just accept the potential latency hit for now. + // write lock on it, so we shouldn't block anyone. See the comment on + // [`InMemoryLayer::freeze`] to understand how locking between the append path + // and layer flushing works. let inner = self.inner.read().await; + let index = self.index.read().await; use l0_flush::Inner; let _concurrency_permit = match l0_flush_global_state { @@ -743,13 +771,9 @@ impl InMemoryLayer { let key_count = if let Some(key_range) = key_range { let key_range = key_range.start.to_compact()..key_range.end.to_compact(); - inner - .index - .iter() - .filter(|(k, _)| key_range.contains(k)) - .count() + index.iter().filter(|(k, _)| key_range.contains(k)).count() } else { - inner.index.len() + index.len() }; if key_count == 0 { return Ok(None); @@ -772,7 +796,7 @@ impl InMemoryLayer { let file_contents = inner.file.load_to_io_buf(ctx).await?; let file_contents = file_contents.freeze(); - for (key, vec_map) in inner.index.iter() { + for (key, vec_map) in index.iter() { // Write all page versions for (lsn, entry) in vec_map .as_slice() From 136cf1979baf96ff345fa3ff75f619c4d22ccd8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 21 May 2025 13:28:22 +0200 Subject: [PATCH 129/142] Add metric for number of offloaded timelines (#11976) We want to keep track of the number of offloaded timelines. It's a per-tenant shard metric because each shard makes offloading decisions on its own. 
--- pageserver/src/metrics.rs | 14 +++++++++++++- pageserver/src/tenant.rs | 19 +++++++++++++++++-- test_runner/fixtures/metrics.py | 1 + test_runner/regress/test_timeline_archive.py | 10 ++++++++++ 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c50f730f41..eae3045a3b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1066,6 +1066,15 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(| .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); +pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tenant_offloaded_timelines", + "Number of offloaded timelines of a tenant", + &["tenant_id", "shard_id"] + ) + .expect("Failed to register pageserver_tenant_offloaded_timelines metric") +}); + pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", @@ -3551,11 +3560,14 @@ impl TimelineMetrics { } pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { + let tid = tenant_shard_id.tenant_id.to_string(); + let shard_id = tenant_shard_id.shard_slug().to_string(); + // Only shard zero deals in synthetic sizes if tenant_shard_id.is_shard_zero() { - let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } + let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]); tenant_throttling::remove_tenant_metrics(tenant_shard_id); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fffd1f4090..35ddba355d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -86,8 +86,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::{ BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, - INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC, - TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, + INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; @@ -3348,6 +3348,13 @@ impl TenantShard { activated_timelines += 1; } + let tid = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = self.tenant_shard_id.shard_slug().to_string(); + let offloaded_timeline_count = timelines_offloaded_accessor.len(); + TENANT_OFFLOADED_TIMELINES + .with_label_values(&[&tid, &shard_id]) + .set(offloaded_timeline_count as u64); + self.state.send_modify(move |current_state| { assert!( matches!(current_state, TenantState::Activating(_)), @@ -5560,6 +5567,14 @@ impl TenantShard { } } + // Update metrics + let tid = self.tenant_shard_id.to_string(); + let shard_id = self.tenant_shard_id.shard_slug().to_string(); + let set_key = &[tid.as_str(), shard_id.as_str()][..]; + TENANT_OFFLOADED_TIMELINES + .with_label_values(set_key) + .set(manifest.offloaded_timelines.len() as u64); + // Upload the manifest. Remote storage does no retries internally, so retry here. 
match backoff::retry( || async { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 879808b7ba..1dd4fe8316 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -184,6 +184,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + "pageserver_tenant_offloaded_timelines", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 4360b42d68..8d46ef8306 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -193,6 +193,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" ) + offloaded_count = ps_http.get_metric_value( + "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"} + ) + assert offloaded_count == 0 + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -244,6 +249,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b wait_until(leaf_offloaded) wait_until(parent_offloaded) + offloaded_count = ps_http.get_metric_value( + "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"} + ) + assert offloaded_count == 2 + # Offloaded child timelines should still prevent deletion with pytest.raises( PageserverApiException, From 5db20af8a70d27e11bf7221bc63895f763386636 Mon Sep 17 00:00:00 2001 From: Konstantin Merenkov Date: Wed, 21 May 2025 17:27:30 +0200 Subject: [PATCH 130/142] Keep the conn info cache on max_client_conn from pgbouncer (#11986) ## Problem Hitting max_client_conn from pgbouncer would lead to invalidation of the conn info cache. Customers would hit the limit on wake_compute. ## Summary of changes `should_retry_wake_compute` detects this specific error from pgbouncer as non-retriable, meaning we won't try to wake up the compute again. 
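For clarity, the classification added in `proxy/src/proxy/retry.rs`, restated as a free-standing helper (the function name is made up for illustration; the `DbError` accessors and the message text are exactly the ones used in the diff below):

```rust
use postgres_client::error::{DbError, SqlState};

/// Illustrative helper: pgbouncer reports an exhausted `max_client_conn` as a protocol
/// violation, so we match on the message text. The compute itself is healthy in this case,
/// so waking it again (and invalidating the conn info cache) would not help.
fn is_pgbouncer_max_client_conn(err: &DbError) -> bool {
    err.code() == &SqlState::PROTOCOL_VIOLATION
        && err
            .message()
            .contains("no more connections allowed (max_client_conn)")
}
```

`should_retry_wake_compute` returns `false` whenever this predicate matches, leaving the cached compute node info intact.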
--- Cargo.lock | 40 +++++++ libs/proxy/tokio-postgres2/src/error/mod.rs | 21 ++++ proxy/Cargo.toml | 1 + proxy/src/proxy/retry.rs | 70 +++++++++++- proxy/src/proxy/tests/mod.rs | 111 +++++++++++++++++++- workspace_hack/Cargo.toml | 1 + 6 files changed, 240 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f4d537b33..b52ecec128 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3898,6 +3898,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -4182,6 +4192,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -5239,6 +5255,7 @@ dependencies = [ "tracing-log", "tracing-opentelemetry", "tracing-subscriber", + "tracing-test", "tracing-utils", "try-lock", "typed-json", @@ -7689,6 +7706,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -7702,6 +7720,27 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-test" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68" +dependencies = [ + "tracing-core", + "tracing-subscriber", + "tracing-test-macro", +] + +[[package]] +name = "tracing-test-macro" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568" +dependencies = [ + "quote", + "syn 2.0.100", +] + [[package]] name = "tracing-utils" version = "0.1.0" @@ -8554,6 +8593,7 @@ dependencies = [ "tracing", "tracing-core", "tracing-log", + "tracing-subscriber", "url", "uuid", "zeroize", diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index b12e76e5bf..8149bceeb9 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -86,6 +86,27 @@ pub struct DbError { } impl DbError { + pub fn new_test_error(code: SqlState, message: String) -> Self { + DbError { + severity: "ERROR".to_string(), + parsed_severity: Some(Severity::Error), + code, + message, + detail: None, + hint: None, + position: None, + where_: None, + schema: None, + table: None, + column: None, + datatype: None, + constraint: None, + file: None, + line: None, + routine: None, + } + } + pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result { let mut severity = None; let mut parsed_severity = None; diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2cec510d82..ce8610be24 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -127,3 +127,4 @@ rstest.workspace = true walkdir.workspace = true rand_distr = "0.4" tokio-postgres.workspace = true +tracing-test = "0.2" \ No newline at end of file diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 42d1491782..0879564ced 
100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError { use postgres_client::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. - !matches!( + let non_retriable_pg_errors = matches!( self.code(), &SqlState::TOO_MANY_CONNECTIONS | &SqlState::OUT_OF_MEMORY @@ -56,8 +56,20 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError { | &SqlState::T_R_SERIALIZATION_FAILURE | &SqlState::INVALID_CATALOG_NAME | &SqlState::INVALID_SCHEMA_NAME - | &SqlState::INVALID_PARAMETER_VALUE - ) + | &SqlState::INVALID_PARAMETER_VALUE, + ); + if non_retriable_pg_errors { + return false; + } + // PGBouncer errors that should not trigger a wake_compute retry. + if self.code() == &SqlState::PROTOCOL_VIOLATION { + // Source for the error message: + // https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070 + return !self + .message() + .contains("no more connections allowed (max_client_conn)"); + } + true } } @@ -110,3 +122,55 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati .base_delay .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } + +#[cfg(test)] +mod tests { + use super::ShouldRetryWakeCompute; + use postgres_client::error::{DbError, SqlState}; + + #[test] + fn should_retry_wake_compute_for_db_error() { + // These SQLStates should NOT trigger a wake_compute retry. + let non_retry_states = [ + SqlState::TOO_MANY_CONNECTIONS, + SqlState::OUT_OF_MEMORY, + SqlState::SYNTAX_ERROR, + SqlState::T_R_SERIALIZATION_FAILURE, + SqlState::INVALID_CATALOG_NAME, + SqlState::INVALID_SCHEMA_NAME, + SqlState::INVALID_PARAMETER_VALUE, + ]; + for state in non_retry_states { + let err = DbError::new_test_error(state.clone(), "oops".to_string()); + assert!( + !err.should_retry_wake_compute(), + "State {state:?} unexpectedly retried" + ); + } + + // Errors coming from pgbouncer should not trigger a wake_compute retry + let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"]; + for error in non_retry_pgbouncer_errors { + let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string()); + assert!( + !err.should_retry_wake_compute(), + "PGBouncer error {error:?} unexpectedly retried" + ); + } + + // These SQLStates should trigger a wake_compute retry. 
+ let retry_states = [ + SqlState::CONNECTION_FAILURE, + SqlState::CONNECTION_EXCEPTION, + SqlState::CONNECTION_DOES_NOT_EXIST, + SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ]; + for state in retry_states { + let err = DbError::new_test_error(state.clone(), "oops".to_string()); + assert!( + err.should_retry_wake_compute(), + "State {state:?} unexpectedly skipped retry" + ); + } + } +} diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index f47636cd71..be6426a63c 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -15,6 +15,7 @@ use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; use tokio::io::DuplexStream; +use tracing_test::traced_test; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -381,8 +382,14 @@ enum ConnectAction { WakeFail, WakeRetry, Connect, + // connect_once -> Err, could_retry = true, should_retry_wake_compute = true Retry, + // connect_once -> Err, could_retry = true, should_retry_wake_compute = false + RetryNoWake, + // connect_once -> Err, could_retry = false, should_retry_wake_compute = true Fail, + // connect_once -> Err, could_retry = false, should_retry_wake_compute = false + FailNoWake, } #[derive(Clone)] @@ -424,6 +431,7 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + wakeable: bool, kind: crate::error::ErrorKind, } @@ -448,7 +456,7 @@ impl CouldRetry for TestConnectError { } impl ShouldRetryWakeCompute for TestConnectError { fn should_retry_wake_compute(&self) -> bool { - true + self.wakeable } } @@ -471,10 +479,22 @@ impl ConnectMechanism for TestConnectMechanism { ConnectAction::Connect => Ok(TestConnection), ConnectAction::Retry => Err(TestConnectError { retryable: true, + wakeable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::RetryNoWake => Err(TestConnectError { + retryable: true, + wakeable: false, kind: ErrorKind::Compute, }), ConnectAction::Fail => Err(TestConnectError { retryable: false, + wakeable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::FailNoWake => Err(TestConnectError { + retryable: false, + wakeable: false, kind: ErrorKind::Compute, }), x => panic!("expecting action {x:?}, connect is called instead"), @@ -709,3 +729,92 @@ async fn wake_non_retry() { .unwrap_err(); mechanism.verify(); } + +#[tokio::test] +#[traced_test] +async fn fail_but_wake_invalidates_cache() { + let ctx = RequestContext::test(); + let mech = TestConnectMechanism::new(vec![ + ConnectAction::Wake, + ConnectAction::Fail, + ConnectAction::Wake, + ConnectAction::Connect, + ]); + let user = helper_create_connect_info(&mech); + let cfg = config(); + + connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg) + .await + .unwrap(); + + assert!(logs_contain( + "invalidating stalled compute node info cache entry" + )); +} + +#[tokio::test] +#[traced_test] +async fn fail_no_wake_skips_cache_invalidation() { + let ctx = RequestContext::test(); + let mech = TestConnectMechanism::new(vec![ + ConnectAction::Wake, + ConnectAction::FailNoWake, + ConnectAction::Connect, + ]); + let user = helper_create_connect_info(&mech); + let cfg = config(); + + connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg) + .await + .unwrap(); + + assert!(!logs_contain( + "invalidating stalled compute node info cache entry" + )); +} + +#[tokio::test] +#[traced_test] +async fn retry_but_wake_invalidates_cache() { + let _ = env_logger::try_init(); + use ConnectAction::*; + + let ctx = RequestContext::test(); + // Wake → Retry (retryable + 
wakeable) → Wake → Connect + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let cfg = config(); + + connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg) + .await + .unwrap(); + mechanism.verify(); + + // Because Retry has wakeable=true, we should see invalidate_cache + assert!(logs_contain( + "invalidating stalled compute node info cache entry" + )); +} + +#[tokio::test] +#[traced_test] +async fn retry_no_wake_skips_invalidation() { + let _ = env_logger::try_init(); + use ConnectAction::*; + + let ctx = RequestContext::test(); + // Wake → RetryNoWake (retryable + NOT wakeable) + let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]); + let user_info = helper_create_connect_info(&mechanism); + let cfg = config(); + + connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg) + .await + .unwrap_err(); + mechanism.verify(); + + // Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache + assert!(!logs_contain( + "invalidating stalled compute node info cache entry" + )); +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 69d44b82ea..87d0092fb2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -107,6 +107,7 @@ tower = { version = "0.4", default-features = false, features = ["balance", "buf tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } From 95a5f749c83c0fb26ab2cd06c5d840b816f9b9f8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 21 May 2025 17:42:11 +0200 Subject: [PATCH 131/142] pageserver: use an `Option` for `GcCutoffs::time` (#11984) ## Problem It is not currently possible to disambiguate a timeline with an uninitialized PITR cutoff from one that was created within the PITR window -- both of these have `GcCutoffs::time == Lsn(0)`. For billing metrics, we need to disambiguate these to avoid accidentally billing the entire history when a tenant is initially loaded. Touches https://github.com/neondatabase/cloud/issues/28155. ## Summary of changes Make `GcCutoffs::time` an `Option`, and only set it to `Some` when initialized. A `pitr_interval` of 0 will yield `Some(last_record_lsn)`. This PR takes a conservative approach, and mostly retains the old behavior of consumers by using `unwrap_or_default()` to yield 0 when uninitialized, to avoid accidentally introducing bugs -- except in cases where there is high confidence that the change is beneficial (e.g. for the `pageserver_pitr_history_size` Prometheus metric and to return early during GC). 
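To make the new semantics concrete, here is a small self-contained sketch (simplified stand-in types, not the pageserver code) of the `Option`-based disambiguation and the conservative minimum described above:

```rust
// Simplified stand-ins; only the Option-vs-Lsn(0) distinction matters here.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

struct GcCutoffs {
    space: Lsn,
    /// None = PITR cutoff not computed yet. Previously this state was indistinguishable
    /// from a cutoff at Lsn(0), i.e. from a timeline created inside the PITR window.
    time: Option<Lsn>,
}

impl GcCutoffs {
    fn select_min(&self) -> Lsn {
        // If the PITR cutoff has not been computed yet, nothing may be GC'd.
        std::cmp::min(self.space, self.time.unwrap_or_default())
    }
}

fn main() {
    let uninitialized = GcCutoffs { space: Lsn(0x40), time: None };
    assert_eq!(uninitialized.select_min(), Lsn(0)); // conservative: hold GC

    let initialized = GcCutoffs { space: Lsn(0x40), time: Some(Lsn(0x30)) };
    assert_eq!(initialized.select_min(), Lsn(0x30));
}
```

Most existing consumers keep the old behavior via `unwrap_or_default()`, while GC itself now returns early when the cutoff is `None`.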
--- pageserver/src/http/routes.rs | 2 +- pageserver/src/tenant.rs | 58 +++++++++-------- pageserver/src/tenant/size.rs | 2 +- pageserver/src/tenant/timeline.rs | 67 ++++++++++---------- pageserver/src/tenant/timeline/compaction.rs | 2 +- 5 files changed, 68 insertions(+), 63 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0b36eb5df7..0d6791cddd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -449,7 +449,7 @@ async fn build_timeline_info_common( // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we // actually trimmed data to), which can pass each other when PITR is changed. let min_readable_lsn = std::cmp::max( - timeline.get_gc_cutoff_lsn(), + timeline.get_gc_cutoff_lsn().unwrap_or_default(), *timeline.get_applied_gc_cutoff_lsn(), ); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 35ddba355d..c15b44469a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4594,7 +4594,7 @@ impl TenantShard { target.cutoffs = GcCutoffs { space: space_cutoff, - time: Lsn::INVALID, + time: None, }; } } @@ -4678,7 +4678,7 @@ impl TenantShard { if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { target.within_ancestor_pitr = - timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; + Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time; } } @@ -4691,13 +4691,15 @@ impl TenantShard { } else { 0 }); - timeline.metrics.pitr_history_size.set( - timeline - .get_last_record_lsn() - .checked_sub(target.cutoffs.time) - .unwrap_or(Lsn(0)) - .0, - ); + if let Some(time_cutoff) = target.cutoffs.time { + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(time_cutoff) + .unwrap_or_default() + .0, + ); + } // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline? 
// - this timeline was created while we were finding cutoffs @@ -4706,8 +4708,8 @@ impl TenantShard { let original_cutoffs = target.cutoffs.clone(); // GC cutoffs should never go back target.cutoffs = GcCutoffs { - space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)), - time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)), + space: cutoffs.space.max(original_cutoffs.space), + time: cutoffs.time.max(original_cutoffs.time), } } } @@ -8952,7 +8954,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.time = Some(Lsn(0x30)); guard.cutoffs.space = Lsn(0x30); } @@ -9060,7 +9062,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.time = Some(Lsn(0x40)); guard.cutoffs.space = Lsn(0x40); } tline @@ -9478,7 +9480,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -9562,7 +9564,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.time = Some(Lsn(0x40)); guard.cutoffs.space = Lsn(0x40); } tline @@ -10033,7 +10035,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -10096,7 +10098,7 @@ mod tests { let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( @@ -10174,7 +10176,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x38); + guard.cutoffs.time = Some(Lsn(0x38)); guard.cutoffs.space = Lsn(0x38); } tline @@ -10282,7 +10284,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -10345,7 +10347,7 @@ mod tests { let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( @@ -10531,7 +10533,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { - time: Lsn(0x10), + time: Some(Lsn(0x10)), space: Lsn(0x10), }, leases: Default::default(), @@ -10551,7 +10553,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { - time: Lsn(0x50), + time: Some(Lsn(0x50)), space: Lsn(0x50), }, leases: Default::default(), @@ -11272,7 +11274,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -11661,7 +11663,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -11724,7 +11726,7 @@ mod tests { let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( 
@@ -11913,7 +11915,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -11976,7 +11978,7 @@ mod tests { let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( @@ -12239,7 +12241,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index bf5d9bc87a..d1020cff96 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -235,7 +235,7 @@ pub(super) async fn gather_inputs( // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // the space cutoff. - let mut next_pitr_cutoff = gc_info.cutoffs.time; + let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index da2e56d80a..670e0865df 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -529,29 +529,24 @@ impl GcInfo { /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this /// is a single number (the oldest LSN which we must retain), but it internally distinguishes /// between time-based and space-based retention for observability and consumption metrics purposes. -#[derive(Debug, Clone)] +#[derive(Clone, Debug, Default)] pub(crate) struct GcCutoffs { /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much /// history we must keep to retain a specified number of bytes of WAL. pub(crate) space: Lsn, - /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much - /// history we must keep to enable reading back at least the PITR interval duration. - pub(crate) time: Lsn, -} - -impl Default for GcCutoffs { - fn default() -> Self { - Self { - space: Lsn::INVALID, - time: Lsn::INVALID, - } - } + /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates + /// how much history we must keep to enable reading back at least the PITR interval duration. + /// + /// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield + /// Some(last_record_lsn). + pub(crate) time: Option, } impl GcCutoffs { fn select_min(&self) -> Lsn { - std::cmp::min(self.space, self.time) + // NB: if we haven't computed the PITR cutoff yet, we can't GC anything. + self.space.min(self.time.unwrap_or_default()) } } @@ -1088,11 +1083,14 @@ impl Timeline { /// Get the bytes written since the PITR cutoff on this branch, and /// whether this branch's ancestor_lsn is within its parent's PITR. pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + // TODO: for backwards compatibility, we return the full history back to 0 when the PITR + // cutoff has not yet been initialized. 
This should return None instead, but this is exposed + // in external HTTP APIs and callers may not handle a null value. let gc_info = self.gc_info.read().unwrap(); let history = self .get_last_record_lsn() - .checked_sub(gc_info.cutoffs.time) - .unwrap_or(Lsn(0)) + .checked_sub(gc_info.cutoffs.time.unwrap_or_default()) + .unwrap_or_default() .0; (history, gc_info.within_ancestor_pitr) } @@ -1102,9 +1100,10 @@ impl Timeline { self.applied_gc_cutoff_lsn.read() } - /// Read timeline's planned GC cutoff: this is the logical end of history that users - /// are allowed to read (based on configured PITR), even if physically we have more history. - pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn { + /// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed + /// to read (based on configured PITR), even if physically we have more history. Returns None + /// if the PITR cutoff has not yet been initialized. + pub(crate) fn get_gc_cutoff_lsn(&self) -> Option { self.gc_info.read().unwrap().cutoffs.time } @@ -6235,14 +6234,12 @@ impl Timeline { pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); - if cfg!(test) { + if cfg!(test) && pitr == Duration::ZERO { // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup - if pitr == Duration::ZERO { - return Ok(GcCutoffs { - time: self.get_last_record_lsn(), - space: space_cutoff, - }); - } + return Ok(GcCutoffs { + time: Some(self.get_last_record_lsn()), + space: space_cutoff, + }); } // Calculate a time-based limit on how much to retain: @@ -6256,14 +6253,14 @@ impl Timeline { // PITR is not set. Retain the size-based limit, or the default time retention, // whichever requires less data. GcCutoffs { - time: self.get_last_record_lsn(), + time: Some(self.get_last_record_lsn()), space: std::cmp::max(time_cutoff, space_cutoff), } } (Duration::ZERO, None) => { // PITR is not set, and time lookup failed GcCutoffs { - time: self.get_last_record_lsn(), + time: Some(self.get_last_record_lsn()), space: space_cutoff, } } @@ -6271,7 +6268,7 @@ impl Timeline { // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - time: *self.get_applied_gc_cutoff_lsn(), + time: Some(*self.get_applied_gc_cutoff_lsn()), space: space_cutoff, } } @@ -6279,7 +6276,7 @@ impl Timeline { // PITR interval is set and we looked up timestamp successfully. Ignore // size based retention and make time cutoff authoritative GcCutoffs { - time: time_cutoff, + time: Some(time_cutoff), space: time_cutoff, } } @@ -6332,7 +6329,7 @@ impl Timeline { ) }; - let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff); + let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default()); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. @@ -6381,7 +6378,7 @@ impl Timeline { async fn gc_timeline( &self, space_cutoff: Lsn, - time_cutoff: Lsn, + time_cutoff: Option, // None if uninitialized retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, @@ -6400,6 +6397,12 @@ impl Timeline { return Ok(result); } + let Some(time_cutoff) = time_cutoff else { + // The GC cutoff should have been computed by now, but let's be defensive. 
+ info!("Nothing to GC: time_cutoff not yet computed"); + return Ok(result); + }; + // We need to ensure that no one tries to read page versions or create // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() // for details. This will block until the old value is no longer in use. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 37c1a8f60c..0e4b14c3e4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1526,7 +1526,7 @@ impl Timeline { info!( "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \ checked {layers_checked}/{layers_total} layers \ - (latest_gc_cutoff={} pitr_cutoff={})", + (latest_gc_cutoff={} pitr_cutoff={:?})", layers_to_rewrite.len(), drop_layers.len(), *latest_gc_cutoff, From f9fdbc961807e86c9a7de9d1199e62b3a60589d6 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 21 May 2025 22:26:05 +0200 Subject: [PATCH 132/142] remove auth_endpoint password from log and command line for local proxy mode (#11991) ## Problem When testing local proxy the auth-endpoint password shows up in command line and log ```bash RUST_LOG=proxy LOGFMT=text cargo run --release --package proxy --bin proxy --features testing -- \ --auth-backend postgres \ --auth-endpoint 'postgresql://postgres:secret_password@127.0.0.1:5432/postgres' \ --tls-cert server.crt \ --tls-key server.key \ --wss 0.0.0.0:4444 ``` ## Summary of changes - Allow to set env variable PGPASSWORD - fall back to use PGPASSWORD env variable when auth-endpoint does not contain password - remove auth-endpoint password from logs in `--features testing` mode Example ```bash export PGPASSWORD=secret_password RUST_LOG=proxy LOGFMT=text cargo run --package proxy --bin proxy --features testing -- \ --auth-backend postgres \ --auth-endpoint 'postgresql://postgres@127.0.0.1:5432/postgres' \ --tls-cert server.crt \ --tls-key server.key \ --wss 0.0.0.0:4444 ``` --- proxy/src/auth/backend/mod.rs | 20 ++++++++++++++++---- proxy/src/binary/proxy.rs | 14 +++++++++++++- proxy/src/url.rs | 6 ++++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 83feed5094..6e5c0a3954 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -80,10 +80,22 @@ impl std::fmt::Display for Backend<'_, ()> { .field(&endpoint.url()) .finish(), #[cfg(any(test, feature = "testing"))] - ControlPlaneClient::PostgresMock(endpoint) => fmt - .debug_tuple("ControlPlane::PostgresMock") - .field(&endpoint.url()) - .finish(), + ControlPlaneClient::PostgresMock(endpoint) => { + let url = endpoint.url(); + match url::Url::parse(url) { + Ok(mut url) => { + let _ = url.set_password(Some("_redacted_")); + let url = url.as_str(); + fmt.debug_tuple("ControlPlane::PostgresMock") + .field(&url) + .finish() + } + Err(_) => fmt + .debug_tuple("ControlPlane::PostgresMock") + .field(&url) + .finish(), + } + } #[cfg(test)] ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index f40d5041c1..5f24940985 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -1,9 +1,13 @@ +#[cfg(any(test, feature = "testing"))] +use std::env; use std::net::SocketAddr; use std::path::PathBuf; use std::pin::pin; use std::sync::Arc; use std::time::Duration; +#[cfg(any(test, feature = "testing"))] +use anyhow::Context; use 
anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; use futures::future::Either; @@ -35,6 +39,8 @@ use crate::scram::threadpool::ThreadPool; use crate::serverless::GlobalConnPoolOptions; use crate::serverless::cancel_set::CancelSet; use crate::tls::client_config::compute_client_config_with_root_certs; +#[cfg(any(test, feature = "testing"))] +use crate::url::ApiUrl; use crate::{auth, control_plane, http, serverless, usage_metrics}; project_git_version!(GIT_VERSION); @@ -777,7 +783,13 @@ fn build_auth_backend( #[cfg(any(test, feature = "testing"))] AuthBackendType::Postgres => { - let url = args.auth_endpoint.parse()?; + let mut url: ApiUrl = args.auth_endpoint.parse()?; + if url.password().is_none() { + let password = env::var("PGPASSWORD") + .with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?; + url.set_password(Some(&password)) + .expect("Failed to set password"); + } let api = control_plane::client::mock::MockControlPlane::new( url, !args.is_private_access_proxy, diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 270cd7c24d..7dce36be2f 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -43,6 +43,12 @@ impl std::ops::Deref for ApiUrl { } } +impl std::ops::DerefMut for ApiUrl { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl std::fmt::Display for ApiUrl { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) From 211b824d62b251828cf19b28720729f9c534ae24 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 22 May 2025 14:26:32 +0200 Subject: [PATCH 133/142] pageserver: add branch-local consumption metrics (#11852) ## Problem For billing, we'd like per-branch consumption metrics. Requires https://github.com/neondatabase/neon/pull/11984. Resolves https://github.com/neondatabase/cloud/issues/28155. ## Summary of changes This patch adds two new consumption metrics: * `written_size_since_parent`: `written_size - ancestor_lsn` * `pitr_history_size_since_parent`: `written_size - max(pitr_cutoff, ancestor_lsn)` Note that `pitr_history_size_since_parent` will not be emitted until the PITR cutoff has been computed, and may or may not increase ~immediately when a user increases their PITR window (depending on how much history we have available and whether the tenant is restarted/migrated). --- pageserver/src/consumption_metrics/metrics.rs | 76 ++++- .../src/consumption_metrics/metrics/tests.rs | 293 +++++++++++++++++- pageserver/src/consumption_metrics/upload.rs | 12 +- pageserver/src/tenant/timeline.rs | 7 + .../test_pageserver_metric_collection.py | 3 + 5 files changed, 377 insertions(+), 14 deletions(-) diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index acdf514101..698390f719 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -18,12 +18,25 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize; // management. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] pub(super) enum Name { - /// Timeline last_record_lsn, absolute + /// Timeline last_record_lsn, absolute. #[serde(rename = "written_size")] WrittenSize, /// Timeline last_record_lsn, incremental #[serde(rename = "written_data_bytes_delta")] WrittenSizeDelta, + /// Written bytes only on this timeline (not including ancestors): + /// written_size - ancestor_lsn + /// + /// On the root branch, this is equivalent to `written_size`. 
+ #[serde(rename = "written_size_since_parent")] + WrittenSizeSinceParent, + /// PITR history size only on this timeline (not including ancestors): + /// last_record_lsn - max(pitr_cutoff, ancestor_lsn). + /// + /// On the root branch, this is its entire PITR history size. Not emitted if GC hasn't computed + /// the PITR cutoff yet. 0 if PITR is disabled. + #[serde(rename = "pitr_history_size_since_parent")] + PitrHistorySizeSinceParent, /// Timeline logical size #[serde(rename = "timeline_logical_size")] LogicalSize, @@ -157,6 +170,32 @@ impl MetricsKey { .incremental_values() } + /// `written_size` - `ancestor_lsn`. + const fn written_size_since_parent( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: Name::WrittenSizeSinceParent, + } + .absolute_values() + } + + /// `written_size` - max(`pitr_cutoff`, `ancestor_lsn`). + const fn pitr_history_size_since_parent( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: Name::PitrHistorySizeSinceParent, + } + .absolute_values() + } + /// Exact [`Timeline::get_current_logical_size`]. /// /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size @@ -334,7 +373,13 @@ impl TenantSnapshot { struct TimelineSnapshot { loaded_at: (Lsn, SystemTime), last_record_lsn: Lsn, + ancestor_lsn: Lsn, current_exact_logical_size: Option, + /// Whether PITR is enabled (pitr_interval > 0). + pitr_enabled: bool, + /// The PITR cutoff LSN. None if not yet initialized. If PITR is disabled, this is approximately + /// Some(last_record_lsn), but may lag behind it since it's computed periodically. + pitr_cutoff: Option, } impl TimelineSnapshot { @@ -354,6 +399,9 @@ impl TimelineSnapshot { } else { let loaded_at = t.loaded_at; let last_record_lsn = t.get_last_record_lsn(); + let ancestor_lsn = t.get_ancestor_lsn(); + let pitr_enabled = !t.get_pitr_interval().is_zero(); + let pitr_cutoff = t.gc_info.read().unwrap().cutoffs.time; let current_exact_logical_size = { let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id); @@ -373,7 +421,10 @@ impl TimelineSnapshot { Ok(Some(TimelineSnapshot { loaded_at, last_record_lsn, + ancestor_lsn, current_exact_logical_size, + pitr_enabled, + pitr_cutoff, })) } } @@ -424,6 +475,8 @@ impl TimelineSnapshot { let up_to = now; + let written_size_last = written_size_now.value.max(prev.1); // don't regress + if let Some(delta) = written_size_now.value.checked_sub(prev.1) { let key_value = written_size_delta_key.from_until(prev.0, up_to, delta); // written_size_delta @@ -441,6 +494,27 @@ impl TimelineSnapshot { }); } + // Compute the branch-local written size. + let written_size_since_parent_key = + MetricsKey::written_size_since_parent(tenant_id, timeline_id); + metrics.push( + written_size_since_parent_key + .at(now, written_size_last.saturating_sub(self.ancestor_lsn.0)), + ); + + // Compute the branch-local PITR history size. Not emitted if GC hasn't yet computed the + // PITR cutoff. 0 if PITR is disabled. 
+ let pitr_history_size_since_parent_key = + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id); + if !self.pitr_enabled { + metrics.push(pitr_history_size_since_parent_key.at(now, 0)); + } else if let Some(pitr_cutoff) = self.pitr_cutoff { + metrics.push(pitr_history_size_since_parent_key.at( + now, + written_size_last.saturating_sub(pitr_cutoff.max(self.ancestor_lsn).0), + )); + } + { let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id); let current_or_previous = self diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 5cfb361e40..3379395b87 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -12,12 +12,17 @@ fn startup_collected_timeline_metrics_before_advancing() { let cache = HashMap::new(); let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + let logical_size = 0x42000; let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, SystemTime::now()), last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), + ancestor_lsn: Lsn(0), + current_exact_logical_size: Some(logical_size), + pitr_enabled: true, + pitr_cutoff: Some(pitr_cutoff), }; let now = DateTime::::from(SystemTime::now()); @@ -33,7 +38,11 @@ fn startup_collected_timeline_metrics_before_advancing() { 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); } @@ -49,7 +58,9 @@ fn startup_collected_timeline_metrics_second_round() { let before = DateTime::::from(before); let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + let logical_size = 0x42000; let mut metrics = Vec::new(); let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id) @@ -59,7 +70,10 @@ fn startup_collected_timeline_metrics_second_round() { let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, init), last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), + ancestor_lsn: Lsn(0), + current_exact_logical_size: Some(logical_size), + pitr_enabled: true, + pitr_cutoff: Some(pitr_cutoff), }; snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); @@ -69,7 +83,11 @@ fn startup_collected_timeline_metrics_second_round() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); } @@ -86,7 +104,9 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { let before = DateTime::::from(before); let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); let disk_consistent_lsn 
= Lsn(initdb_lsn.0 * 2); + let logical_size = 0x42000; let mut metrics = Vec::new(); let cache = HashMap::from([ @@ -103,7 +123,10 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, init), last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), + ancestor_lsn: Lsn(0), + current_exact_logical_size: Some(logical_size), + pitr_enabled: true, + pitr_cutoff: Some(pitr_cutoff), }; snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); @@ -113,16 +136,18 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); } +/// Tests that written sizes do not regress across restarts. #[test] fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { - // it can happen that we lose the inmemorylayer but have previously sent metrics and we - // should never go backwards - let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); @@ -140,7 +165,10 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { let snap = TimelineSnapshot { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(0), current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: Some(Lsn(20)), }; let mut cache = HashMap::from([ @@ -169,6 +197,8 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 100), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 80), ] ); @@ -183,6 +213,157 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 100), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 80), + ] + ); +} + +/// Tests that written sizes do not regress across restarts, even on child branches. +#[test] +fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [later, now, at_restart] = time_backwards(); + + // FIXME: tests would be so much easier if we did not need to juggle back and forth + // SystemTime and DateTime:: ... Could do the conversion only at upload time? 
+ let now = DateTime::::from(now); + let later = DateTime::::from(later); + let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); + let way_before = before_restart - std::time::Duration::from_secs(10 * 60); + let before_restart = DateTime::::from(before_restart); + let way_before = DateTime::::from(way_before); + + let snap = TimelineSnapshot { + loaded_at: (Lsn(50), at_restart), + last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(40), + current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: Some(Lsn(20)), + }; + + let mut cache = HashMap::from([ + MetricsKey::written_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair(), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until( + way_before, + before_restart, + // not taken into account, but the timestamps are important + 999_999_999, + ) + .to_kv_pair(), + ]); + + let mut metrics = Vec::new(); + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + before_restart, + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 60), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60), + ] + ); + + // now if we cache these metrics, and re-run while "still in recovery" + cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); + + // "still in recovery", because our snapshot did not change + snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 60), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60), + ] + ); +} + +/// Tests that written sizes do not regress across restarts, even on child branches and +/// with a PITR cutoff after the branch point. +#[test] +fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn_and_pitr_cutoff() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [later, now, at_restart] = time_backwards(); + + // FIXME: tests would be so much easier if we did not need to juggle back and forth + // SystemTime and DateTime:: ... Could do the conversion only at upload time? 
+ let now = DateTime::::from(now); + let later = DateTime::::from(later); + let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); + let way_before = before_restart - std::time::Duration::from_secs(10 * 60); + let before_restart = DateTime::::from(before_restart); + let way_before = DateTime::::from(way_before); + + let snap = TimelineSnapshot { + loaded_at: (Lsn(50), at_restart), + last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(30), + current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: Some(Lsn(40)), + }; + + let mut cache = HashMap::from([ + MetricsKey::written_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair(), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until( + way_before, + before_restart, + // not taken into account, but the timestamps are important + 999_999_999, + ) + .to_kv_pair(), + ]); + + let mut metrics = Vec::new(); + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + before_restart, + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 70), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60), + ] + ); + + // now if we cache these metrics, and re-run while "still in recovery" + cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); + + // "still in recovery", because our snapshot did not change + snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 70), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60), ] ); } @@ -201,7 +382,10 @@ fn post_restart_current_exact_logical_size_uses_cached() { let snap = TimelineSnapshot { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(0), current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: None, }; let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id) @@ -286,16 +470,101 @@ fn time_backwards() -> [std::time::SystemTime; N] { times } +/// Tests that disabled PITR history does not yield any history size, even when the PITR cutoff +/// indicates otherwise. 
+#[test] +fn pitr_disabled_yields_no_history_size() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let mut metrics = Vec::new(); + let cache = HashMap::new(); + + let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, SystemTime::now()), + last_record_lsn: disk_consistent_lsn, + ancestor_lsn: Lsn(0), + current_exact_logical_size: None, + pitr_enabled: false, + pitr_cutoff: Some(pitr_cutoff), + }; + + let now = DateTime::::from(SystemTime::now()); + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + snap.loaded_at.1.into(), + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0), + ] + ); +} + +/// Tests that uninitialized PITR cutoff does not emit any history size metric at all. +#[test] +fn pitr_uninitialized_does_not_emit_history_size() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let mut metrics = Vec::new(); + let cache = HashMap::new(); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, SystemTime::now()), + last_record_lsn: disk_consistent_lsn, + ancestor_lsn: Lsn(0), + current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: None, + }; + + let now = DateTime::::from(SystemTime::now()); + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + snap.loaded_at.1.into(), + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + ] + ); +} + pub(crate) const fn metric_examples_old( tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [RawMetric; 5] { +) -> [RawMetric; 7] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), @@ -307,10 +576,12 @@ pub(crate) const fn metric_examples( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [NewRawMetric; 5] { +) -> [NewRawMetric; 7] { [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), 
MetricsKey::synthetic_size(tenant_id).at(now, 1), diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 19c5aec5b3..eba773272a 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -513,6 +513,14 @@ mod tests { line!(), r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, + ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"pitr_history_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, + ), ( line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, @@ -560,7 +568,7 @@ mod tests { assert_eq!(upgraded_samples, new_samples); } - fn metric_samples_old() -> [RawMetric; 5] { + fn metric_samples_old() -> [RawMetric; 7] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); @@ -572,7 +580,7 @@ mod tests { super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) } - fn metric_samples() -> [NewRawMetric; 5] { + fn metric_samples() -> [NewRawMetric; 7] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 670e0865df..a251163419 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2536,6 +2536,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } + pub(crate) fn get_pitr_interval(&self) -> Duration { + let tenant_conf = &self.tenant_conf.load().tenant_conf; + tenant_conf + .pitr_interval + .unwrap_or(self.conf.default_tenant_conf.pitr_interval) + } + fn get_compaction_period(&self) -> Duration { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index ffde08a73f..474258c9eb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -508,6 +508,9 @@ PER_METRIC_VERIFIERS = { "remote_storage_size": CannotVerifyAnything, "written_size": WrittenDataVerifier, "written_data_bytes_delta": WrittenDataDeltaVerifier, + "written_size_since_parent": WrittenDataVerifier, # same as written_size on root + "pitr_cutoff": CannotVerifyAnything, + "pitr_history_size_since_parent": WrittenDataVerifier, # same as written_size on root w/o GC "timeline_logical_size": CannotVerifyAnything, "synthetic_storage_size": SyntheticSizeVerifier, } From 
136eaeb74a9945b8b806fd885b0b81e5fe3b9dbb Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 22 May 2025 16:45:00 +0400 Subject: [PATCH 134/142] pageserver: basebackup cache (hackathon project) (#11989) ## Problem Basebackup cache is on the hot path of compute startup and is generated on every request (may be slow). - Issue: https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Add `BasebackupCache` which stores basebackups on local disk. - Basebackup prepare requests are triggered by `XLOG_CHECKPOINT_SHUTDOWN` records in the log. - Limit the size of the cache by number of entries. - Add `basebackup_cache_enabled` feature flag to TenantConfig. - Write tests for the cache ## Not implemented yet - Limit the size of the cache by total size in bytes --------- Co-authored-by: Aleksandr Sarantsev --- control_plane/src/pageserver.rs | 5 + libs/pageserver_api/src/config.rs | 30 ++ libs/pageserver_api/src/models.rs | 13 + pageserver/src/basebackup_cache.rs | 518 +++++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 14 + pageserver/src/config.rs | 8 + pageserver/src/lib.rs | 1 + pageserver/src/metrics.rs | 36 ++ pageserver/src/page_service.rs | 44 ++- pageserver/src/task_mgr.rs | 4 + pageserver/src/tenant.rs | 17 +- pageserver/src/tenant/timeline.rs | 52 ++- pageserver/src/walingest.rs | 4 + test_runner/regress/test_basebackup.py | 77 ++++ 14 files changed, 815 insertions(+), 8 deletions(-) create mode 100644 pageserver/src/basebackup_cache.rs create mode 100644 test_runner/regress/test_basebackup.py diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 587f3774d4..756f2b02db 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -551,6 +551,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Falied to parse 'relsize_snapshot_cache_capacity' as integer")?, + basebackup_cache_enabled: settings + .remove("basebackup_cache_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'basebackup_cache_enabled' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 73b6eee554..0fb2ff38ff 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -183,6 +183,8 @@ pub struct ConfigToml { pub enable_tls_page_service_api: bool, pub dev_mode: bool, pub timeline_import_config: TimelineImportConfig, + #[serde(skip_serializing_if = "Option::is_none")] + pub basebackup_cache_config: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -308,6 +310,26 @@ pub struct TimelineImportConfig { pub import_job_checkpoint_threshold: NonZeroUsize, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(default)] +pub struct BasebackupCacheConfig { + #[serde(with = "humantime_serde")] + pub cleanup_period: Duration, + // FIXME: Support max_size_bytes. 
+ // pub max_size_bytes: usize, + pub max_size_entries: i64, +} + +impl Default for BasebackupCacheConfig { + fn default() -> Self { + Self { + cleanup_period: Duration::from_secs(60), + // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB + max_size_entries: 1000, + } + } +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -491,8 +513,14 @@ pub struct TenantConfigToml { /// Tenant level performance sampling ratio override. Controls the ratio of get page requests /// that will get perf sampling for the tenant. pub sampling_ratio: Option, + /// Capacity of relsize snapshot cache (used by replicas). pub relsize_snapshot_cache_capacity: usize, + + /// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests. + // FIXME: Remove skip_serializing_if when the feature is stable. + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub basebackup_cache_enabled: bool, } pub mod defaults { @@ -666,6 +694,7 @@ impl Default for ConfigToml { import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(), }, + basebackup_cache_config: None, } } } @@ -791,6 +820,7 @@ impl Default for TenantConfigToml { gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, sampling_ratio: None, relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY, + basebackup_cache_enabled: false, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ca26286b86..383939a13f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -632,6 +632,8 @@ pub struct TenantConfigPatch { pub sampling_ratio: FieldPatch>, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub relsize_snapshot_cache_capacity: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub basebackup_cache_enabled: FieldPatch, } /// Like [`crate::config::TenantConfigToml`], but preserves the information @@ -764,6 +766,9 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub relsize_snapshot_cache_capacity: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub basebackup_cache_enabled: Option, } impl TenantConfig { @@ -810,6 +815,7 @@ impl TenantConfig { mut gc_compaction_ratio_percent, mut sampling_ratio, mut relsize_snapshot_cache_capacity, + mut basebackup_cache_enabled, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -914,6 +920,9 @@ impl TenantConfig { patch .relsize_snapshot_cache_capacity .apply(&mut relsize_snapshot_cache_capacity); + patch + .basebackup_cache_enabled + .apply(&mut basebackup_cache_enabled); Ok(Self { checkpoint_distance, @@ -954,6 +963,7 @@ impl TenantConfig { gc_compaction_ratio_percent, sampling_ratio, relsize_snapshot_cache_capacity, + basebackup_cache_enabled, }) } @@ -1065,6 +1075,9 @@ impl TenantConfig { relsize_snapshot_cache_capacity: self .relsize_snapshot_cache_capacity .unwrap_or(global_conf.relsize_snapshot_cache_capacity), + basebackup_cache_enabled: self + .basebackup_cache_enabled + .unwrap_or(global_conf.basebackup_cache_enabled), } } } diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs new file mode 100644 index 0000000000..3a8ec555f7 --- /dev/null +++ b/pageserver/src/basebackup_cache.rs @@ -0,0 +1,518 @@ +use std::{collections::HashMap, sync::Arc}; + +use async_compression::tokio::write::GzipEncoder; +use camino::{Utf8Path, Utf8PathBuf}; 
+use metrics::core::{AtomicU64, GenericCounter}; +use pageserver_api::{config::BasebackupCacheConfig, models::TenantState}; +use tokio::{ + io::{AsyncWriteExt, BufWriter}, + sync::mpsc::{UnboundedReceiver, UnboundedSender}, +}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, + shard::TenantShardId, +}; + +use crate::{ + basebackup::send_basebackup_tarball, + context::{DownloadBehavior, RequestContext}, + metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ}, + task_mgr::TaskKind, + tenant::{ + Timeline, + mgr::{TenantManager, TenantSlot}, + }, +}; + +pub struct BasebackupPrepareRequest { + pub tenant_shard_id: TenantShardId, + pub timeline_id: TimelineId, + pub lsn: Lsn, +} + +pub type BasebackupPrepareSender = UnboundedSender; +pub type BasebackupPrepareReceiver = UnboundedReceiver; + +type BasebackupRemoveEntrySender = UnboundedSender; +type BasebackupRemoveEntryReceiver = UnboundedReceiver; + +/// BasebackupCache stores cached basebackup archives for timelines on local disk. +/// +/// The main purpose of this cache is to speed up the startup process of compute nodes +/// after scaling to zero. +/// Thus, the basebackup is stored only for the latest LSN of the timeline and with +/// fixed set of parameters (gzip=true, full_backup=false, replica=false, prev_lsn=none). +/// +/// The cache receives prepare requests through the `BasebackupPrepareSender` channel, +/// generates a basebackup from the timeline in the background, and stores it on disk. +/// +/// Basebackup requests are pretty rare. We expect ~thousands of entries in the cache +/// and ~1 RPS for get requests. +pub struct BasebackupCache { + data_dir: Utf8PathBuf, + config: BasebackupCacheConfig, + tenant_manager: Arc, + remove_entry_sender: BasebackupRemoveEntrySender, + + entries: std::sync::Mutex>, + + cancel: CancellationToken, + + read_hit_count: GenericCounter, + read_miss_count: GenericCounter, + read_err_count: GenericCounter, + + prepare_ok_count: GenericCounter, + prepare_skip_count: GenericCounter, + prepare_err_count: GenericCounter, +} + +impl BasebackupCache { + /// Creates a BasebackupCache and spawns the background task. + /// The initialization of the cache is performed in the background and does not + /// block the caller. The cache will return `None` for any get requests until + /// initialization is complete. 
+ pub fn spawn( + runtime_handle: &tokio::runtime::Handle, + data_dir: Utf8PathBuf, + config: Option, + prepare_receiver: BasebackupPrepareReceiver, + tenant_manager: Arc, + cancel: CancellationToken, + ) -> Arc { + let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel(); + + let enabled = config.is_some(); + + let cache = Arc::new(BasebackupCache { + data_dir, + config: config.unwrap_or_default(), + tenant_manager, + remove_entry_sender, + + entries: std::sync::Mutex::new(HashMap::new()), + + cancel, + + read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]), + read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]), + read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]), + + prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]), + prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), + prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]), + }); + + if enabled { + runtime_handle.spawn( + cache + .clone() + .background(prepare_receiver, remove_entry_receiver), + ); + } + + cache + } + + /// Gets a basebackup entry from the cache. + /// If the entry is found, opens a file with the basebackup archive and returns it. + /// The open file descriptor will prevent the file system from deleting the file + /// even if the entry is removed from the cache in the background. + pub async fn get( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Option { + // Fast path. Check if the entry exists using the in-memory state. + let tti = TenantTimelineId::new(tenant_id, timeline_id); + if self.entries.lock().unwrap().get(&tti) != Some(&lsn) { + self.read_miss_count.inc(); + return None; + } + + let path = self.entry_path(tenant_id, timeline_id, lsn); + + match tokio::fs::File::open(path).await { + Ok(file) => { + self.read_hit_count.inc(); + Some(file) + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // We may end up here if the basebackup was concurrently removed by the cleanup task. + self.read_miss_count.inc(); + } else { + self.read_err_count.inc(); + tracing::warn!("Unexpected error opening basebackup cache file: {:?}", e); + } + None + } + } + } + + // Private methods. + + fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String { + // The default format for LSN is 0/ABCDEF. + // The backslash is not filename friendly, so serialize it as plain hex. + let lsn = lsn.0; + format!("basebackup_{tenant_id}_{timeline_id}_{lsn:016X}.tar.gz") + } + + fn entry_path(&self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> Utf8PathBuf { + self.data_dir + .join(Self::entry_filename(tenant_id, timeline_id, lsn)) + } + + fn entry_tmp_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Utf8PathBuf { + self.data_dir + .join("tmp") + .join(Self::entry_filename(tenant_id, timeline_id, lsn)) + } + + fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> { + let parts: Vec<&str> = filename + .strip_prefix("basebackup_")? + .strip_suffix(".tar.gz")? + .split('_') + .collect(); + if parts.len() != 3 { + return None; + } + let tenant_id = parts[0].parse::().ok()?; + let timeline_id = parts[1].parse::().ok()?; + let lsn = Lsn(u64::from_str_radix(parts[2], 16).ok()?); + + Some((tenant_id, timeline_id, lsn)) + } + + async fn cleanup(&self) -> anyhow::Result<()> { + // Cleanup tmp directory. 
+ let tmp_dir = self.data_dir.join("tmp"); + let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?; + while let Some(dir_entry) = tmp_dir.next_entry().await? { + if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await { + tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e); + } + } + + // Remove outdated entries. + let entries_old = self.entries.lock().unwrap().clone(); + let mut entries_new = HashMap::new(); + for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() { + if !tenant_shard_id.is_shard_zero() { + continue; + } + let TenantSlot::Attached(tenant) = tenant_slot else { + continue; + }; + let tenant_id = tenant_shard_id.tenant_id; + + for timeline in tenant.list_timelines() { + let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id); + if let Some(&entry_lsn) = entries_old.get(&tti) { + if timeline.get_last_record_lsn() <= entry_lsn { + entries_new.insert(tti, entry_lsn); + } + } + } + } + + for (&tti, &lsn) in entries_old.iter() { + if !entries_new.contains_key(&tti) { + self.remove_entry_sender + .send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn)) + .unwrap(); + } + } + + BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64); + *self.entries.lock().unwrap() = entries_new; + + Ok(()) + } + + async fn on_startup(&self) -> anyhow::Result<()> { + // Create data_dir and tmp directory if they do not exist. + tokio::fs::create_dir_all(&self.data_dir.join("tmp")) + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to create basebackup cache data_dir {:?}: {:?}", + self.data_dir, + e + ) + })?; + + // Read existing entries from the data_dir and add them to in-memory state. + let mut entries = HashMap::new(); + let mut dir = tokio::fs::read_dir(&self.data_dir).await?; + while let Some(dir_entry) = dir.next_entry().await? { + let filename = dir_entry.file_name(); + + if filename == "tmp" { + // Skip the tmp directory. + continue; + } + + let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref()); + let Some((tenant_id, timeline_id, lsn)) = parsed else { + tracing::warn!("Invalid basebackup cache file name: {:?}", filename); + continue; + }; + + let tti = TenantTimelineId::new(tenant_id, timeline_id); + + use std::collections::hash_map::Entry::*; + + match entries.entry(tti) { + Occupied(mut entry) => { + let entry_lsn = *entry.get(); + // Leave only the latest entry, remove the old one. + if lsn < entry_lsn { + self.remove_entry_sender.send(self.entry_path( + tenant_id, + timeline_id, + lsn, + ))?; + } else if lsn > entry_lsn { + self.remove_entry_sender.send(self.entry_path( + tenant_id, + timeline_id, + entry_lsn, + ))?; + entry.insert(lsn); + } else { + // Two different filenames parsed to the same timline_id and LSN. + // Should never happen. + return Err(anyhow::anyhow!( + "Duplicate basebackup cache entry with the same LSN: {:?}", + filename + )); + } + } + Vacant(entry) => { + entry.insert(lsn); + } + } + } + + BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64); + *self.entries.lock().unwrap() = entries; + + Ok(()) + } + + async fn background( + self: Arc, + mut prepare_receiver: BasebackupPrepareReceiver, + mut remove_entry_receiver: BasebackupRemoveEntryReceiver, + ) { + // Panic in the background is a safe fallback. + // It will drop receivers and the cache will be effectively disabled. 
+ self.on_startup() + .await + .expect("Failed to initialize basebackup cache"); + + let mut cleanup_ticker = tokio::time::interval(self.config.cleanup_period); + cleanup_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + + loop { + tokio::select! { + Some(req) = prepare_receiver.recv() => { + if let Err(err) = self.prepare_basebackup( + req.tenant_shard_id, + req.timeline_id, + req.lsn, + ).await { + tracing::info!("Failed to prepare basebackup: {:#}", err); + self.prepare_err_count.inc(); + continue; + } + } + Some(req) = remove_entry_receiver.recv() => { + if let Err(e) = tokio::fs::remove_file(req).await { + tracing::warn!("Failed to remove basebackup cache file: {:#}", e); + } + } + _ = cleanup_ticker.tick() => { + self.cleanup().await.unwrap_or_else(|e| { + tracing::warn!("Failed to clean up basebackup cache: {:#}", e); + }); + } + _ = self.cancel.cancelled() => { + tracing::info!("BasebackupCache background task cancelled"); + break; + } + } + } + } + + /// Prepare a basebackup for the given timeline. + /// + /// If the basebackup already exists with a higher LSN or the timeline already + /// has a higher last_record_lsn, skip the preparation. + /// + /// The basebackup is prepared in a temporary directory and then moved to the final + /// location to make the operation atomic. + async fn prepare_basebackup( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req_lsn: Lsn, + ) -> anyhow::Result<()> { + tracing::info!( + tenant_id = %tenant_shard_id.tenant_id, + %timeline_id, + %req_lsn, + "Preparing basebackup for timeline", + ); + + let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id); + + { + let entries = self.entries.lock().unwrap(); + if let Some(&entry_lsn) = entries.get(&tti) { + if entry_lsn >= req_lsn { + tracing::info!( + %timeline_id, + %req_lsn, + %entry_lsn, + "Basebackup entry already exists for timeline with higher LSN, skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + } + + if entries.len() as i64 >= self.config.max_size_entries { + tracing::info!( + %timeline_id, + %req_lsn, + "Basebackup cache is full, skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + } + + let tenant = self + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + let tenant_state = tenant.current_state(); + if tenant_state != TenantState::Active { + anyhow::bail!( + "Tenant {} is not active, current state: {:?}", + tenant_shard_id.tenant_id, + tenant_state + ) + } + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn > req_lsn { + tracing::info!( + %timeline_id, + %req_lsn, + %last_record_lsn, + "Timeline has a higher LSN than the requested one, skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + + let entry_tmp_path = self.entry_tmp_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); + + let res = self + .prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn) + .await; + + if let Err(err) = res { + tracing::info!("Failed to prepare basebackup tmp file: {:#}", err); + // Try to clean up tmp file. If we fail, the background clean up task will take care of it. 
+ match tokio::fs::remove_file(&entry_tmp_path).await { + Ok(_) => {} + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + tracing::info!("Failed to remove basebackup tmp file: {:?}", e); + } + } + return Err(err); + } + + // Move the tmp file to the final location atomically. + let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); + tokio::fs::rename(&entry_tmp_path, &entry_path).await?; + + let mut entries = self.entries.lock().unwrap(); + if let Some(old_lsn) = entries.insert(tti, req_lsn) { + // Remove the old entry if it exists. + self.remove_entry_sender + .send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn)) + .unwrap(); + } + BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64); + + self.prepare_ok_count.inc(); + Ok(()) + } + + /// Prepares a basebackup in a temporary file. + async fn prepare_basebackup_tmp( + &self, + emptry_tmp_path: &Utf8Path, + timeline: &Arc, + req_lsn: Lsn, + ) -> anyhow::Result<()> { + let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download); + let ctx = ctx.with_scope_timeline(timeline); + + let file = tokio::fs::File::create(emptry_tmp_path).await?; + let mut writer = BufWriter::new(file); + + let mut encoder = GzipEncoder::with_quality( + &mut writer, + // Level::Best because compression is not on the hot path of basebackup requests. + // The decompression is almost not affected by the compression level. + async_compression::Level::Best, + ); + + // We may receive a request before the WAL record is applied to the timeline. + // Wait for the requested LSN to be applied. + timeline + .wait_lsn( + req_lsn, + crate::tenant::timeline::WaitLsnWaiter::BaseBackupCache, + crate::tenant::timeline::WaitLsnTimeout::Default, + &ctx, + ) + .await?; + + send_basebackup_tarball( + &mut encoder, + timeline, + Some(req_lsn), + None, + false, + false, + &ctx, + ) + .await?; + + encoder.shutdown().await?; + writer.flush().await?; + writer.into_inner().sync_all().await?; + + Ok(()) + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 4c2572a577..6001ea0345 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; use nix::sys::socket::{setsockopt, sockopt}; +use pageserver::basebackup_cache::BasebackupCache; use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields}; use pageserver::controller_upcall_client::StorageControllerUpcallClient; use pageserver::deletion_queue::DeletionQueue; @@ -541,6 +542,8 @@ fn start_pageserver( pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); // Scan the local 'tenants/' directory and start loading the tenants + let (basebackup_prepare_sender, basebackup_prepare_receiver) = + tokio::sync::mpsc::unbounded_channel(); let deletion_queue_client = deletion_queue.new_client(); let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -551,12 +554,22 @@ fn start_pageserver( remote_storage: remote_storage.clone(), deletion_queue_client, l0_flush_global_state, + basebackup_prepare_sender, }, order, shutdown_pageserver.clone(), ))?; let tenant_manager = Arc::new(tenant_manager); + let basebackup_cache = BasebackupCache::spawn( + BACKGROUND_RUNTIME.handle(), + conf.basebackup_cache_dir(), + 
conf.basebackup_cache_config.clone(), + basebackup_prepare_receiver, + Arc::clone(&tenant_manager), + shutdown_pageserver.child_token(), + ); + BACKGROUND_RUNTIME.spawn({ let shutdown_pageserver = shutdown_pageserver.clone(); let drive_init = async move { @@ -763,6 +776,7 @@ fn start_pageserver( } else { None }, + basebackup_cache, ); // All started up! Now just sit and wait for shutdown signal. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 62f5b009f7..e8b3b7b3ab 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -232,6 +232,8 @@ pub struct PageServerConf { pub dev_mode: bool, pub timeline_import_config: pageserver_api::config::TimelineImportConfig, + + pub basebackup_cache_config: Option, } /// Token for authentication to safekeepers @@ -261,6 +263,10 @@ impl PageServerConf { self.workdir.join("metadata.json") } + pub fn basebackup_cache_dir(&self) -> Utf8PathBuf { + self.workdir.join("basebackup_cache") + } + pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. @@ -407,6 +413,7 @@ impl PageServerConf { enable_tls_page_service_api, dev_mode, timeline_import_config, + basebackup_cache_config, } = config_toml; let mut conf = PageServerConf { @@ -461,6 +468,7 @@ impl PageServerConf { enable_tls_page_service_api, dev_mode, timeline_import_config, + basebackup_cache_config, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 42454e7356..71d9c6603f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,6 +3,7 @@ mod auth; pub mod basebackup; +pub mod basebackup_cache; pub mod config; pub mod consumption_metrics; pub mod context; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index eae3045a3b..3076c7f1d6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -4359,6 +4359,42 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } +pub(crate) static BASEBACKUP_CACHE_READ: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_basebackup_cache_read_total", + "Number of read accesses to the basebackup cache grouped by hit/miss/error", + &["result"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_basebackup_cache_prepare_total", + "Number of prepare requests processed by the basebackup cache grouped by ok/skip/error", + &["result"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_basebackup_cache_entries_total", + "Number of entries in the basebackup cache" + ) + .expect("failed to define a metric") +}); + +// FIXME: Support basebackup cache size metrics. 
+#[allow(dead_code)] +pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_basebackup_cache_size_bytes", + "Total size of all basebackup cache entries on disk in bytes" + ) + .expect("failed to define a metric") +}); + static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_config_ignored_items", diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e46ba8d3a1..69519dfa87 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -9,7 +9,6 @@ use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; -use crate::PERF_TRACE_TARGET; use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; @@ -52,8 +51,10 @@ use utils::simple_rcu::RcuReadGuard; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; +use crate::PERF_TRACE_TARGET; use crate::auth::check_permission; use crate::basebackup::BasebackupError; +use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, @@ -107,6 +108,7 @@ pub fn spawn( perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, tls_config: Option>, + basebackup_cache: Arc, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -128,6 +130,7 @@ pub fn spawn( conf.pg_auth_type, tls_config, conf.page_service_pipelining.clone(), + basebackup_cache, libpq_ctx, cancel.clone(), ) @@ -186,6 +189,7 @@ pub async fn libpq_listener_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, + basebackup_cache: Arc, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -229,6 +233,7 @@ pub async fn libpq_listener_main( auth_type, tls_config.clone(), pipelining_config.clone(), + Arc::clone(&basebackup_cache), connection_ctx, connections_cancel.child_token(), gate_guard, @@ -271,6 +276,7 @@ async fn page_service_conn_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, + basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -336,6 +342,7 @@ async fn page_service_conn_main( pipelining_config, conf.get_vectored_concurrent_io, perf_span_fields, + basebackup_cache, connection_ctx, cancel.clone(), gate_guard, @@ -390,6 +397,8 @@ struct PageServerHandler { pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, + basebackup_cache: Arc, + gate_guard: GateGuard, } @@ -849,6 +858,7 @@ impl PageServerHandler { pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, perf_span_fields: ConnectionPerfSpanFields, + basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -862,6 +872,7 @@ impl PageServerHandler { cancel, pipelining_config, get_vectored_concurrent_io, + basebackup_cache, gate_guard, } } @@ -2493,6 +2504,8 @@ impl PageServerHandler { .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &self.cancel).await?; + let mut from_cache = false; + // Send a tarball of the latest layer on the timeline. Compress if not // fullbackup. 
TODO Compress in that case too (tests need to be updated) if full_backup { @@ -2510,7 +2523,33 @@ impl PageServerHandler { .map_err(map_basebackup_error)?; } else { let mut writer = BufWriter::new(pgb.copyout_writer()); - if gzip { + + let cached = { + // Basebackup is cached only for this combination of parameters. + if timeline.is_basebackup_cache_enabled() + && gzip + && lsn.is_some() + && prev_lsn.is_none() + { + self.basebackup_cache + .get(tenant_id, timeline_id, lsn.unwrap()) + .await + } else { + None + } + }; + + if let Some(mut cached) = cached { + from_cache = true; + tokio::io::copy(&mut cached, &mut writer) + .await + .map_err(|e| { + map_basebackup_error(BasebackupError::Client( + e, + "handle_basebackup_request,cached,copy", + )) + })?; + } else if gzip { let mut encoder = GzipEncoder::with_quality( &mut writer, // NOTE using fast compression because it's on the critical path @@ -2569,6 +2608,7 @@ impl PageServerHandler { info!( lsn_await_millis = lsn_awaited_after.as_millis(), basebackup_millis = basebackup_after.as_millis(), + %from_cache, "basebackup complete" ); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index d4873e60a1..55272b2125 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -380,6 +380,10 @@ pub enum TaskKind { DetachAncestor, ImportPgdata, + + /// Background task of [`crate::basebackup_cache::BasebackupCache`]. + /// Prepares basebackups and clears outdated entries. + BasebackupCache, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c15b44469a..bf3f71e35a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -78,6 +78,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit use self::timeline::{ EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, }; +use crate::basebackup_cache::BasebackupPrepareSender; use crate::config::PageServerConf; use crate::context; use crate::context::RequestContextBuilder; @@ -157,6 +158,7 @@ pub struct TenantSharedResources { pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, pub l0_flush_global_state: L0FlushGlobalState, + pub basebackup_prepare_sender: BasebackupPrepareSender, } /// A [`TenantShard`] is really an _attached_ tenant. The configuration @@ -317,12 +319,15 @@ pub struct TenantShard { gc_cs: tokio::sync::Mutex<()>, walredo_mgr: Option>, - // provides access to timeline data sitting in the remote storage + /// Provides access to timeline data sitting in the remote storage. pub(crate) remote_storage: GenericRemoteStorage, - // Access to global deletion queue for when this tenant wants to schedule a deletion + /// Access to global deletion queue for when this tenant wants to schedule a deletion. deletion_queue_client: DeletionQueueClient, + /// A channel to send async requests to prepare a basebackup for the basebackup cache. + basebackup_prepare_sender: BasebackupPrepareSender, + /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`]. 
cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, @@ -1286,6 +1291,7 @@ impl TenantShard { remote_storage, deletion_queue_client, l0_flush_global_state, + basebackup_prepare_sender, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -1301,6 +1307,7 @@ impl TenantShard { remote_storage.clone(), deletion_queue_client, l0_flush_global_state, + basebackup_prepare_sender, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -4239,6 +4246,7 @@ impl TenantShard { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, + basebackup_prepare_sender: BasebackupPrepareSender, ) -> TenantShard { assert!(!attached_conf.location.generation.is_none()); @@ -4342,6 +4350,7 @@ impl TenantShard { ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), l0_flush_global_state, + basebackup_prepare_sender, } } @@ -5261,6 +5270,7 @@ impl TenantShard { pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_compaction_trigger: self.l0_compaction_trigger.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), + basebackup_prepare_sender: self.basebackup_prepare_sender.clone(), } } @@ -5843,6 +5853,8 @@ pub(crate) mod harness { ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); + let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel(); + let tenant = Arc::new(TenantShard::new( TenantState::Attaching, self.conf, @@ -5860,6 +5872,7 @@ pub(crate) mod harness { self.deletion_queue.new_client(), // TODO: ideally we should run all unit tests with both configs L0FlushGlobalState::new(L0FlushConfig::default()), + basebackup_requst_sender, )); let preload = tenant diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a251163419..54dc3b2d0b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,8 +24,6 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::PERF_TRACE_TARGET; -use crate::walredo::RedoAttemptType; use anyhow::{Context, Result, anyhow, bail, ensure}; use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; @@ -94,10 +92,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; use super::tasks::log_compaction_error; use super::upload_queue::NotInitialized; use super::{ - AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, + AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded, debug_assert_current_span_has_tenant_and_timeline_id, }; +use crate::PERF_TRACE_TARGET; use crate::aux_file::AuxFileSizeEstimator; +use crate::basebackup_cache::BasebackupPrepareRequest; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, @@ -131,6 +131,7 @@ use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::walingest::WalLagCooldown; +use crate::walredo::RedoAttemptType; use crate::{ZERO_PAGE, task_mgr, walredo}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -196,6 +197,7 @@ pub struct TimelineResources { pub pagestream_throttle_metrics: Arc, pub l0_compaction_trigger: Arc, 
pub l0_flush_global_state: l0_flush::L0FlushGlobalState, + pub basebackup_prepare_sender: BasebackupPrepareSender, } pub struct Timeline { @@ -439,6 +441,9 @@ pub struct Timeline { pub(crate) rel_size_v2_status: ArcSwapOption, wait_lsn_log_slow: tokio::sync::Semaphore, + + /// A channel to send async requests to prepare a basebackup for the basebackup cache. + basebackup_prepare_sender: BasebackupPrepareSender, } pub(crate) enum PreviousHeatmap { @@ -1028,6 +1033,7 @@ pub(crate) enum WaitLsnWaiter<'a> { Tenant, PageService, HttpEndpoint, + BaseBackupCache, } /// Argument to [`Timeline::shutdown`]. @@ -1554,7 +1560,8 @@ impl Timeline { } WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService - | WaitLsnWaiter::HttpEndpoint => unreachable!( + | WaitLsnWaiter::HttpEndpoint + | WaitLsnWaiter::BaseBackupCache => unreachable!( "tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind() ), @@ -2459,6 +2466,41 @@ impl Timeline { false } } + + pub(crate) fn is_basebackup_cache_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .basebackup_cache_enabled + .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled) + } + + /// Prepare basebackup for the given LSN and store it in the basebackup cache. + /// The method is asynchronous and returns immediately. + /// The actual basebackup preparation is performed in the background + /// by the basebackup cache on a best-effort basis. + pub(crate) fn prepare_basebackup(&self, lsn: Lsn) { + if !self.is_basebackup_cache_enabled() { + return; + } + if !self.tenant_shard_id.is_shard_zero() { + // In theory we should never get here, but just in case check it. + // Preparing basebackup doesn't make sense for shards other than shard zero. + return; + } + + let res = self + .basebackup_prepare_sender + .send(BasebackupPrepareRequest { + tenant_shard_id: self.tenant_shard_id, + timeline_id: self.timeline_id, + lsn, + }); + if let Err(e) = res { + // May happen during shutdown, it's not critical. + info!("Failed to send shutdown checkpoint: {e:#}"); + } + } } /// Number of times we will compute partition within a checkpoint distance. @@ -3028,6 +3070,8 @@ impl Timeline { rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), wait_lsn_log_slow: tokio::sync::Semaphore::new(1), + + basebackup_prepare_sender: resources.basebackup_prepare_sender, }; result.repartition_threshold = diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c7a6655052..c1a3b79915 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1316,6 +1316,10 @@ impl WalIngest { } }); + if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + modification.tline.prepare_basebackup(lsn); + } + Ok(()) } diff --git a/test_runner/regress/test_basebackup.py b/test_runner/regress/test_basebackup.py new file mode 100644 index 0000000000..b083c394c7 --- /dev/null +++ b/test_runner/regress/test_basebackup.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from fixtures.utils import wait_until + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): + """ + Simple test for basebackup cache. + 1. Check that we always hit the cache after compute restart. + 2. Check that we eventually delete old basebackup files, but not the latest one. + 3. Check that we delete basebackup file for timeline with active compute. 
+ """ + + neon_env_builder.pageserver_config_override = """ + tenant_config = { basebackup_cache_enabled = true } + basebackup_cache_config = { cleanup_period = '1s' } + """ + + env = neon_env_builder.init_start() + ep = env.endpoints.create("main") + ps = env.pageserver + ps_http = ps.http_client() + + # 1. Check that we always hit the cache after compute restart. + for i in range(3): + ep.start() + ep.stop() + + def check_metrics(i=i): + metrics = ps_http.get_metrics() + # Never miss. + # The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests. + # All other requests should be a hit + assert ( + metrics.query_one( + "pageserver_basebackup_cache_read_total", {"result": "miss"} + ).value + == 0 + ) + # All but the first requests are hits. + assert ( + metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value + == i + ) + # Every compute shut down should trigger a prepare reuest. + assert ( + metrics.query_one( + "pageserver_basebackup_cache_prepare_total", {"result": "ok"} + ).value + == i + 1 + ) + + wait_until(check_metrics) + + # 2. Check that we eventually delete old basebackup files, but not the latest one. + def check_bb_file_count(): + bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir()) + # tmp dir + 1 basebackup file. + assert len(bb_files) == 2 + + wait_until(check_bb_file_count) + + # 3. Check that we delete basebackup file for timeline with active compute. + ep.start() + ep.safe_psql("create table t1 as select generate_series(1, 10) as n") + + def check_bb_dir_empty(): + bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir()) + # only tmp dir. + assert len(bb_files) == 1 + + wait_until(check_bb_dir_empty) From e69ae739ff8b44e3274dff804e41cf76fcefd244 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 May 2025 16:20:50 +0100 Subject: [PATCH 135/142] fix(compute_ctl): fix rsyslogd restart race. (#11988) Add retry loop around waiting for rsyslog start ## Problem ## Summary of changes --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Matthias van de Meent Co-authored-by: Konstantin Knizhnik --- compute_tools/src/rsyslog.rs | 37 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 7be97046a0..23036e9234 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -27,6 +27,23 @@ fn get_rsyslog_pid() -> Option { } } +fn wait_for_rsyslog_pid() -> Result { + for attempt in 1..=50 { + match get_rsyslog_pid() { + Some(pid) => return Ok(pid), + None => { + info!( + "rsyslogd is not running, attempt {}/50. Waiting...", + attempt + ); + std::thread::sleep(std::time::Duration::from_millis(2)); + } + } + } + + Err(anyhow::anyhow!("rsyslogd did not start after 50 attempts")) +} + // Restart rsyslogd to apply the new configuration. // This is necessary, because there is no other way to reload the rsyslog configuration. 
// @@ -36,14 +53,14 @@ fn get_rsyslog_pid() -> Option { // TODO: test it properly // fn restart_rsyslog() -> Result<()> { - let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?; - info!("rsyslogd is running with pid: {}, restart it", old_pid); - // kill it to restart let _ = Command::new("pkill") .arg("rsyslogd") .output() - .context("Failed to stop rsyslogd")?; + .context("Failed to restart rsyslogd")?; + + // ensure rsyslogd is running + wait_for_rsyslog_pid()?; Ok(()) } @@ -131,15 +148,11 @@ pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result return Ok(()); } - // When new config is empty we can simply remove the configuration file. + // Nothing to configure if new_config.is_empty() { - info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH); - match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) { - Ok(_) => {} - Err(err) if err.kind() == ErrorKind::NotFound => {} - Err(err) => return Err(err.into()), - } - restart_rsyslog()?; + // When the configuration is removed, PostgreSQL will stop sending data + // to the files watched by rsyslog, so restarting rsyslog is more effort + // than just ignoring this change. return Ok(()); } From cf81330fbc8ecbfc720f0cafaf6b1ec57793928f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 22 May 2025 21:15:05 +0200 Subject: [PATCH 136/142] fix(compute_ctl): Wait for rsyslog longer and with backoff (#12002) ## Problem https://github.com/neondatabase/neon/pull/11988 waits only for max ~200ms, so we still see failures, which self-resolve after several operation retries. ## Summary of changes Change it to waiting for at least 5 seconds, starting with 2 ms sleep between iterations and x2 sleep on each next iteration. It could be that it's not a problem with a slow `rsyslog` start, but a longer wait won't hurt. If it won't start, we should debug why `inittab` doesn't start it, or maybe there is another problem. --- compute_tools/src/rsyslog.rs | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 23036e9234..c873697623 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -28,20 +28,37 @@ fn get_rsyslog_pid() -> Option { } fn wait_for_rsyslog_pid() -> Result { - for attempt in 1..=50 { + const MAX_WAIT: Duration = Duration::from_secs(5); + const INITIAL_SLEEP: Duration = Duration::from_millis(2); + + let mut sleep_duration = INITIAL_SLEEP; + let start = std::time::Instant::now(); + let mut attempts = 1; + + for attempt in 1.. { + attempts = attempt; match get_rsyslog_pid() { Some(pid) => return Ok(pid), None => { + if start.elapsed() >= MAX_WAIT { + break; + } info!( - "rsyslogd is not running, attempt {}/50. Waiting...", - attempt + "rsyslogd is not running, attempt {}. Sleeping for {} ms", + attempt, + sleep_duration.as_millis() ); - std::thread::sleep(std::time::Duration::from_millis(2)); + std::thread::sleep(sleep_duration); + sleep_duration *= 2; } } } - Err(anyhow::anyhow!("rsyslogd did not start after 50 attempts")) + Err(anyhow::anyhow!( + "rsyslogd is not running after waiting for {} seconds and {} attempts", + attempts, + start.elapsed().as_secs() + )) } // Restart rsyslogd to apply the new configuration. 
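The "at least 5 seconds" figure in the message above follows directly from the doubling schedule. As a rough illustration only (assuming the 2 ms initial sleep and 5 s cap stated in the patch, and ignoring the time spent inside `get_rsyslog_pid()` itself):

```rust
// Cumulative sleep of a backoff that starts at 2 ms and doubles each attempt:
// 2 + 4 + ... + 2048 = 4094 ms after 11 sleeps, so the 12th sleep is the one
// that crosses the 5 s deadline.
fn main() {
    let (mut sleep_ms, mut total_ms, mut sleeps) = (2u64, 0u64, 0u32);
    while total_ms < 5_000 {
        total_ms += sleep_ms;
        sleep_ms *= 2;
        sleeps += 1;
    }
    println!("{sleeps} sleeps, {total_ms} ms slept in total");
}
```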
From 8ff25dca8eb579f7c17e0cc0833c8ad8fa3344f7 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Fri, 23 May 2025 08:08:32 +0300
Subject: [PATCH 137/142] Add online_advisor extension (#11898)

## Problem

Detect problems with the Postgres optimiser: lack of indexes and statistics.

## Summary of changes

https://github.com/knizhnik/online_advisor

Add the online_advisor extension to the Docker image.

---------

Co-authored-by: Konstantin Knizhnik
---
 compute/compute-node.Dockerfile | 34 +++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 17e50697db..3e2c09493f 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -582,6 +582,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
 
+#########################################################################################
+#
+# Layer "online_advisor-build"
+# compile online_advisor extension
+#
+#########################################################################################
+FROM build-deps AS online_advisor-src
+ARG PG_VERSION
+
+# online_advisor supports all Postgres versions starting from PG14, but prior to PG17 it has to be included in shared_preload_libraries
+# last release 1.0 - May 15, 2025
+WORKDIR /ext-src
+RUN case "${PG_VERSION:?}" in \
+    "v17") \
+    ;; \
+    *) \
+    echo "skipping the version of online_advisor for $PG_VERSION" && exit 0 \
+    ;; \
+    esac && \
+    wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
+    echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54  online_advisor.tar.gz" | sha256sum --check && \
+    mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
+
+FROM pg-build AS online_advisor-build
+COPY --from=online_advisor-src /ext-src/ /ext-src/
+WORKDIR /ext-src/
+RUN if [ -d online_advisor-src ]; then \
+    cd online_advisor-src && \
+    make -j install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
+    fi
+
 #########################################################################################
 #
 # Layer "pg_hashids-build"
@@ -1648,6 +1680,7 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1823,6 +1856,7 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/
 COPY --from=pg_graphql-src /ext-src/ /ext-src/
 #COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
 COPY --from=hypopg-src /ext-src/ /ext-src/
+COPY --from=online_advisor-src /ext-src/ /ext-src/
 COPY --from=pg_hashids-src /ext-src/ /ext-src/
 COPY --from=rum-src /ext-src/ /ext-src/
 COPY --from=pgtap-src /ext-src/ /ext-src/

From d5023f2b8966ba47251ac62dae7188c40cc56cd8 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Fri, 23 May 2025 11:48:06 +0300
Subject: [PATCH 138/142] Restrict pumping prefetch state to regular backends (#12000)

## Problem

See https://github.com/neondatabase/neon/issues/11997

The guard prevents a race condition when pumping prefetch state (initiated by a
timeout). The assert checks that prefetching is also done under the guard, but
the prewarm worker knows nothing about it.

## Summary of changes

Pump prefetch state only in regular backends. Prewarming is now done by
background workers. It also makes no sense to pump prefetch state in any other
background workers (parallel executors, vacuum, ...), because they are
short-lived and cannot leave unconsumed responses in the socket.
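The change boils down to a per-process "I am a prewarm worker" flag that keeps the timeout-driven prefetch pump disarmed in those workers. A minimal Rust sketch of that gating decision, with invented names standing in for `AmPrewarmWorker`, `readahead_getpage_pull_timeout_ms`, and the ring-buffer check in the C diff below (it is not the pgxn/neon code itself):

```rust
// Sketch of the gating idea: only regular backends arm the timeout that pumps
// prefetch state; background prewarm workers mark themselves and opt out.
#[derive(Clone, Copy)]
struct BackendState {
    am_prewarm_worker: bool,        // set when the prewarm background worker starts
    pull_timeout_ms: i32,           // analogue of readahead_getpage_pull_timeout_ms
    has_unconsumed_responses: bool, // stand-in for ring_receive != ring_unused
}

impl BackendState {
    fn needs_prefetch_pump_timeout(&self) -> bool {
        self.has_unconsumed_responses && !self.am_prewarm_worker && self.pull_timeout_ms > 0
    }
}

fn main() {
    let regular = BackendState {
        am_prewarm_worker: false,
        pull_timeout_ms: 100,
        has_unconsumed_responses: true,
    };
    let prewarm = BackendState { am_prewarm_worker: true, ..regular };
    assert!(regular.needs_prefetch_pump_timeout());
    assert!(!prewarm.needs_prefetch_pump_timeout());
    println!("prefetch pump timeout is armed only in the regular backend");
}
```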
--------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 5 +++-- pgxn/neon/file_cache.c | 4 ++++ pgxn/neon/neon.h | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 9609f186b9..2655a45bcc 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -717,7 +717,7 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); - Assert(readpage_reentrant_guard); + Assert(readpage_reentrant_guard || AmPrewarmWorker); if (slot->status != PRFS_REQUESTED || slot->response != NULL || @@ -800,7 +800,7 @@ communicator_prefetch_receive(BufferTag tag) PrfHashEntry *entry; PrefetchRequest hashkey; - Assert(readpage_reentrant_guard); + Assert(readpage_reentrant_guard || AmPrewarmWorker); /* do not pump prefetch state in prewarm worker */ hashkey.buftag = tag; entry = prfh_lookup(MyPState->prf_hash, &hashkey); if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) @@ -2450,6 +2450,7 @@ void communicator_reconfigure_timeout_if_needed(void) { bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + !AmPrewarmWorker && /* do not pump prefetch state in prewarm worker */ readahead_getpage_pull_timeout_ms > 0; if (needs_set != timeout_set) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 176fd9643f..45a4695495 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -201,6 +201,8 @@ static shmem_request_hook_type prev_shmem_request_hook; bool lfc_store_prefetch_result; bool lfc_prewarm_update_ws_estimation; +bool AmPrewarmWorker; + #define LFC_ENABLED() (lfc_ctl->limit != 0) /* @@ -845,6 +847,8 @@ lfc_prewarm_main(Datum main_arg) PrewarmWorkerState* ws; uint32 worker_id = DatumGetInt32(main_arg); + AmPrewarmWorker = true; + pqsignal(SIGTERM, die); BackgroundWorkerUnblockSignals(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index a2e81feb5f..431dacb708 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -23,6 +23,8 @@ extern int wal_acceptor_connection_timeout; extern int readahead_getpage_pull_timeout_ms; extern bool disable_wal_prev_lsn_checks; +extern bool AmPrewarmWorker; + #if PG_MAJORVERSION_NUM >= 17 extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; extern uint32 WAIT_EVENT_NEON_LFC_READ; From 06ce7040413234f4460f89fbca666b3a19f86446 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 23 May 2025 10:57:35 +0200 Subject: [PATCH 139/142] Cargo.toml: upgrade Tonic to 0.13.1 (#11995) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We're about to implement a gRPC interface for Pageserver. Let's upgrade Tonic first, to avoid a more painful migration later. It's currently only used by storage-broker. Touches #11728. ## Summary of changes Upgrade Tonic 0.12.3 → 0.13.1. Also opportunistically upgrade Prost 0.13.3 → 0.13.5. This transitively pulls in Indexmap 2.0.1 → 2.9.0, but it doesn't appear to be used in any particularly critical code paths. 
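One visible consequence of the upgrade: the storage-broker diff below drops `use tonic::body::{self, BoxBody, empty_body}` and instead pulls `Empty`, `Full`, and `BoxBody` from `http_body_util`, so the plain-HTTP responses (metrics, healthcheck) build their bodies directly from that crate. A minimal sketch of that body-construction pattern, assuming an `Infallible` error type and hypothetical helper names rather than the actual storage-broker code:

```rust
// Sketch only: constructing boxed HTTP bodies from http_body_util instead of
// the tonic::body helpers the old code imported.
use bytes::Bytes;
use http_body_util::{BodyExt, Empty, Full, combinators::BoxBody};
use std::convert::Infallible;

fn empty_body() -> BoxBody<Bytes, Infallible> {
    Empty::<Bytes>::new().boxed()
}

fn full_body(msg: &'static str) -> BoxBody<Bytes, Infallible> {
    Full::new(Bytes::from_static(msg.as_bytes())).boxed()
}

fn main() {
    // A handler would wrap these in an http::Response builder; only the body
    // construction is shown here.
    let _empty = empty_body();
    let _metrics = full_body("# HELP ...\n");
}
```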
--- Cargo.lock | 174 +++++++++++++---------- Cargo.toml | 6 +- storage_broker/src/bin/storage_broker.rs | 13 +- workspace_hack/Cargo.toml | 11 +- 4 files changed, 114 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b52ecec128..422af2c97e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1276,7 +1276,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "indexmap 2.0.1", + "indexmap 2.9.0", "jsonwebtoken", "regex", "remote_storage", @@ -1308,7 +1308,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", - "indexmap 2.0.1", + "indexmap 2.9.0", "itertools 0.10.5", "jsonwebtoken", "metrics", @@ -2597,7 +2597,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.9", - "indexmap 2.0.1", + "indexmap 2.9.0", "slab", "tokio", "tokio-util", @@ -2616,7 +2616,7 @@ dependencies = [ "futures-sink", "futures-util", "http 1.1.0", - "indexmap 2.0.1", + "indexmap 2.9.0", "slab", "tokio", "tokio-util", @@ -2863,14 +2863,14 @@ dependencies = [ "pprof", "regex", "routerify", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pemfile 2.1.1", "serde", "serde_json", "serde_path_to_error", "thiserror 1.0.69", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-util", "tracing", @@ -3200,12 +3200,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.1" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", - "hashbrown 0.14.5", + "hashbrown 0.15.2", "serde", ] @@ -3228,7 +3228,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" dependencies = [ "ahash", - "indexmap 2.0.1", + "indexmap 2.9.0", "is-terminal", "itoa", "log", @@ -3251,7 +3251,7 @@ dependencies = [ "crossbeam-utils", "dashmap 6.1.0", "env_logger", - "indexmap 2.0.1", + "indexmap 2.9.0", "itoa", "log", "num-format", @@ -4112,7 +4112,7 @@ dependencies = [ "opentelemetry-http", "opentelemetry-proto", "opentelemetry_sdk", - "prost 0.13.3", + "prost 0.13.5", "reqwest", "thiserror 1.0.69", ] @@ -4125,8 +4125,8 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", - "prost 0.13.3", - "tonic", + "prost 0.13.5", + "tonic 0.12.3", ] [[package]] @@ -4339,7 +4339,7 @@ dependencies = [ "reqwest", "rpds", "rstest", - "rustls 0.23.18", + "rustls 0.23.27", "scopeguard", "send-future", "serde", @@ -4358,7 +4358,7 @@ dependencies = [ "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-tar", "tokio-util", @@ -4455,8 +4455,8 @@ dependencies = [ name = "pageserver_page_api" version = "0.1.0" dependencies = [ - "prost 0.13.3", - "tonic", + "prost 0.13.5", + "tonic 0.13.1", "tonic-build", "workspace_hack", ] @@ -4837,14 +4837,14 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pemfile 2.1.1", "serde", "thiserror 1.0.69", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-util", "tracing", ] @@ -4951,7 +4951,7 @@ dependencies = [ "inferno 0.12.0", "num", "paste", - "prost 0.13.3", + "prost 0.13.5", ] [[package]] @@ -5056,12 +5056,12 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.3" +version = "0.13.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", - "prost-derive 0.13.3", + "prost-derive 0.13.5", ] [[package]] @@ -5099,7 +5099,7 @@ dependencies = [ "once_cell", "petgraph", "prettyplease", - "prost 0.13.3", + "prost 0.13.5", "prost-types 0.13.3", "regex", "syn 2.0.100", @@ -5121,9 +5121,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", "itertools 0.12.1", @@ -5147,7 +5147,7 @@ version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ - "prost 0.13.3", + "prost 0.13.5", ] [[package]] @@ -5195,7 +5195,7 @@ dependencies = [ "hyper 0.14.30", "hyper 1.4.1", "hyper-util", - "indexmap 2.0.1", + "indexmap 2.9.0", "ipnet", "itertools 0.10.5", "itoa", @@ -5229,7 +5229,7 @@ dependencies = [ "rsa", "rstest", "rustc-hash 1.1.0", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", @@ -5248,7 +5248,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres2", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-tungstenite 0.21.0", "tokio-util", "tracing", @@ -5472,13 +5472,13 @@ dependencies = [ "num-bigint", "percent-encoding", "pin-project-lite", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "ryu", "sha1_smol", "socket2", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-util", "url", ] @@ -5926,15 +5926,15 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.18" +version = "0.23.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f" +checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321" dependencies = [ "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.8", + "rustls-webpki 0.103.3", "subtle", "zeroize", ] @@ -6023,6 +6023,17 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustls-webpki" +version = "0.103.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -6074,7 +6085,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "safekeeper_api", "safekeeper_client", "scopeguard", @@ -6091,7 +6102,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-tar", "tokio-util", @@ -6263,7 +6274,7 @@ checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335" dependencies = [ "httpdate", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -6692,11 +6703,11 @@ dependencies = [ "metrics", "once_cell", "parking_lot 0.12.1", - "prost 0.13.3", - "rustls 0.23.18", + "prost 0.13.5", + "rustls 0.23.27", "tokio", - "tokio-rustls 0.26.0", 
- "tonic", + "tokio-rustls 0.26.2", + "tonic 0.13.1", "tonic-build", "tracing", "utils", @@ -6738,7 +6749,7 @@ dependencies = [ "regex", "reqwest", "routerify", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "safekeeper_api", "safekeeper_client", @@ -6753,7 +6764,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-util", "tracing", "utils", @@ -6791,7 +6802,7 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "serde", "serde_json", @@ -7325,10 +7336,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ "ring", - "rustls 0.23.18", + "rustls 0.23.27", "tokio", "tokio-postgres", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "x509-certificate", ] @@ -7372,12 +7383,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ - "rustls 0.23.18", - "rustls-pki-types", + "rustls 0.23.27", "tokio", ] @@ -7475,7 +7485,7 @@ version = "0.22.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" dependencies = [ - "indexmap 2.0.1", + "indexmap 2.9.0", "serde", "serde_spanned", "toml_datetime", @@ -7487,6 +7497,27 @@ name = "tonic" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "percent-encoding", + "pin-project", + "prost 0.13.5", + "tokio-stream", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" dependencies = [ "async-trait", "base64 0.22.1", @@ -7499,13 +7530,12 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost 0.13.3", + "prost 0.13.5", "rustls-native-certs 0.8.0", - "rustls-pemfile 2.1.1", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", - "tower 0.4.13", + "tower 0.5.2", "tower-layer", "tower-service", "tracing", @@ -7513,9 +7543,9 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.12.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847" dependencies = [ "prettyplease", "proc-macro2", @@ -7533,16 +7563,11 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", - "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand 0.8.5", - "slab", "tokio", - "tokio-util", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -7553,9 +7578,12 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", + "indexmap 
2.9.0", "pin-project-lite", + "slab", "sync_wrapper 1.0.1", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -7883,7 +7911,7 @@ dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pki-types", "url", "webpki-roots", @@ -8078,7 +8106,7 @@ dependencies = [ "pageserver_api", "postgres_ffi", "pprof", - "prost 0.13.3", + "prost 0.13.5", "remote_storage", "serde", "serde_json", @@ -8534,8 +8562,7 @@ dependencies = [ "hyper 0.14.30", "hyper 1.4.1", "hyper-util", - "indexmap 1.9.3", - "indexmap 2.0.1", + "indexmap 2.9.0", "itertools 0.12.1", "lazy_static", "libc", @@ -8557,16 +8584,16 @@ dependencies = [ "percent-encoding", "prettyplease", "proc-macro2", - "prost 0.13.3", + "prost 0.13.5", "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pki-types", - "rustls-webpki 0.102.8", + "rustls-webpki 0.103.3", "scopeguard", "sec1 0.7.3", "serde", @@ -8584,12 +8611,11 @@ dependencies = [ "time", "time-macros", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-util", "toml_edit", - "tonic", - "tower 0.4.13", + "tower 0.5.2", "tracing", "tracing-core", "tracing-log", diff --git a/Cargo.toml b/Cargo.toml index a280c446b9..c8e2c38c85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -149,7 +149,7 @@ pin-project-lite = "0.2" pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency -prost = "0.13" +prost = "0.13.5" rand = "0.8" redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" @@ -199,7 +199,7 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} +tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "prost", "tls-ring", "tls-native-roots"] } tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } @@ -280,7 +280,7 @@ criterion = "0.5.1" rcgen = "0.13" rstest = "0.18" camino-tempfile = "1.0.2" -tonic-build = "0.12" +tonic-build = "0.13.1" [patch.crates-io] diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 476d5f03ea..bae5ccb36c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -17,12 +17,14 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; +use bytes::Bytes; use camino::Utf8PathBuf; use clap::{Parser, command}; use futures::future::OptionFuture; use futures_core::Stream; use futures_util::StreamExt; -use http_body_util::Full; +use http_body_util::combinators::BoxBody; +use http_body_util::{Empty, Full}; use http_utils::tls_certs::ReloadingCertificateResolver; use hyper::body::Incoming; use hyper::header::CONTENT_TYPE; @@ -46,7 +48,6 @@ use tokio::net::TcpListener; use tokio::sync::broadcast; use tokio::sync::broadcast::error::RecvError; use tokio::time; -use tonic::body::{self, BoxBody, empty_body}; use tonic::codegen::Service; use tonic::{Code, Request, Response, Status}; use tracing::*; @@ -634,7 +635,7 @@ impl BrokerService for Broker { // We serve only metrics and healthcheck through http1. 
async fn http1_handler( req: hyper::Request, -) -> Result, Infallible> { +) -> Result>, Infallible> { let resp = match (req.method(), req.uri().path()) { (&Method::GET, "/metrics") => { let mut buffer = vec![]; @@ -645,16 +646,16 @@ async fn http1_handler( hyper::Response::builder() .status(StatusCode::OK) .header(CONTENT_TYPE, encoder.format_type()) - .body(body::boxed(Full::new(bytes::Bytes::from(buffer)))) + .body(BoxBody::new(Full::new(Bytes::from(buffer)))) .unwrap() } (&Method::GET, "/status") => hyper::Response::builder() .status(StatusCode::OK) - .body(empty_body()) + .body(BoxBody::new(Empty::new())) .unwrap(), _ => hyper::Response::builder() .status(StatusCode::NOT_FOUND) - .body(empty_body()) + .body(BoxBody::new(Empty::new())) .unwrap(), }; Ok(resp) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 87d0092fb2..9e1123ac0e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -53,8 +53,7 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["client", "http1", "http2", "runtime", "server", "stream"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } -indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } -indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } +indexmap = { version = "2", features = ["serde"] } itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } @@ -82,7 +81,7 @@ regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] } rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } rustls-pki-types = { version = "1", features = ["std"] } -rustls-webpki = { version = "0.102", default-features = false, features = ["ring", "std"] } +rustls-webpki = { version = "0.103", default-features = false, features = ["ring", "std"] } scopeguard = { version = "1" } sec1 = { version = "0.7", features = ["pem", "serde", "std", "subtle"] } serde = { version = "1", features = ["alloc", "derive"] } @@ -102,8 +101,7 @@ tokio-rustls = { version = "0.26", default-features = false, features = ["loggin tokio-stream = { version = "0.1" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "tls-roots"] } -tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } @@ -125,8 +123,7 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, 
features = ["std"] } -indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } +indexmap = { version = "2", features = ["serde"] } itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } From 87fc0a03747c4c1e24b2a8ec1479ea97150fa433 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 23 May 2025 11:37:19 +0200 Subject: [PATCH 140/142] periodic pagebench on hetzner runners (#11963) ## Problem - Benchmark periodic pagebench had inconsistent benchmarking results even when run with the same commit hash. Hypothesis is this was due to running on dedicated but virtualized EC instance with varying CPU frequency. - the dedicated instance type used for the benchmark is quite "old" and we increasingly get `An error occurred (InsufficientInstanceCapacity) when calling the StartInstances operation (reached max retries: 2): Insufficient capacity.` - periodic pagebench uses a snapshot of pageserver timelines to have the same layer structure in each run and get consistent performance. Re-creating the snapshot was a painful manual process (see https://github.com/neondatabase/cloud/issues/27051 and https://github.com/neondatabase/cloud/issues/27653) ## Summary of changes - Run the periodic pagebench on a custom hetzner GitHub runner with large nvme disk and governor set to defined perf profile - provide a manual dispatch option for the workflow that allows to create a new snapshot - keep the manual dispatch option to specify a commit hash useful for bi-secting regressions - always use the newest created snapshot (S3 bucket uses date suffix in S3 key, example `s3://neon-github-public-dev/performance/pagebench/shared-snapshots-2025-05-17/` - `--ignore` `test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py` in regular benchmarks run for each commit - improve perf copying snapshot by using `cp` subprocess instead of traversing tree in python ## Example runs with code in this PR: - run which creates new snapshot https://github.com/neondatabase/neon/actions/runs/15083408849/job/42402986376#step:19:55 - run which uses latest snapshot - https://github.com/neondatabase/neon/actions/runs/15084907676/job/42406240745#step:11:65 --- .github/workflows/build_and_test.yml | 3 +- .github/workflows/periodic_pagebench.yml | 281 ++++++++++++------ scripts/benchmark_durations.py | 6 - test_runner/fixtures/neon_fixtures.py | 7 +- ...er_max_throughput_getpage_at_latest_lsn.py | 8 +- 5 files changed, 197 insertions(+), 108 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a887db2ab1..9f2fa3d52c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -314,7 +314,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + # test_pageserver_max_throughput_getpage_at_latest_lsn is run in separate workflow periodic_pagebench.yml because it needs snapshots + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} diff --git a/.github/workflows/periodic_pagebench.yml 
b/.github/workflows/periodic_pagebench.yml index 532da435c2..317db94052 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -1,4 +1,4 @@ -name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region +name: Periodic pagebench performance test on unit-perf hetzner runner on: schedule: @@ -8,7 +8,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 */3 * * *' # Runs every 3 hours + - cron: '0 */4 * * *' # Runs every 4 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: commit_hash: @@ -16,6 +16,11 @@ on: description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' required: false default: '' + recreate_snapshots: + type: boolean + description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.' + required: false + default: false defaults: run: @@ -29,13 +34,13 @@ permissions: contents: read jobs: - trigger_bench_on_ec2_machine_in_eu_central_1: + run_periodic_pagebench_test: permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small ] + runs-on: [ self-hosted, unit-perf ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: @@ -44,10 +49,13 @@ jobs: options: --init timeout-minutes: 360 # Set the timeout to 6 hours env: - API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} RUN_ID: ${{ github.run_id }} - AWS_DEFAULT_REGION : "eu-central-1" - AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + DEFAULT_PG_VERSION: 16 + BUILD_TYPE: release + RUST_BACKTRACE: 1 + # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container + S3_BUCKET: neon-github-public-dev + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" steps: # we don't need the neon source code because we run everything remotely # however we still need the local github actions to run the allure step below @@ -56,99 +64,194 @@ jobs: with: egress-policy: audit - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive + id: set-env + shell: bash -euxo pipefail {0} + run: | + { + echo "NEON_DIR=${RUNNER_TEMP}/neon" + echo "NEON_BIN=${RUNNER_TEMP}/neon/bin" + echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install" + echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib" + echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots" + echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output" + echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local" + echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results" + echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results" + } >> "$GITHUB_ENV" - - name: Show my own (github runner) external IP address - usefull for IP allowlisting - run: curl https://ifconfig.me + echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT" - - name: Assume AWS OIDC role that allows to manage (start/stop/describe... 
EC machine) - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} - role-duration-seconds: 3600 - - - name: Start EC2 instance and wait for the instance to boot up - run: | - aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID - aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID - sleep 60 # sleep some time to allow cloudinit and our API server to start up - - - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US - run: | - public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) - echo "Public IP of the EC2 instance: $public_ip" - echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV - + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built) - name: Determine commit hash + id: commit_hash + shell: bash -euxo pipefail {0} env: INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} run: | - if [ -z "$INPUT_COMMIT_HASH" ]; then - echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + if [[ -z "${INPUT_COMMIT_HASH}" ]]; then + COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha') + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV + echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else - echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + COMMIT_HASH="${INPUT_COMMIT_HASH}" + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV + echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi + - name: Checkout the neon repository at given commit hash + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ steps.commit_hash.outputs.commit_hash }} - - name: Start Bench with run_id + # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash + # example artifact + # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst + - name: Determine artifact S3_KEY for given commit hash and download and extract artifact + id: artifact_prefix + shell: bash -euxo pipefail {0} + env: + ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst + COMMIT_HASH: ${{ env.COMMIT_HASH }} + COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }} run: | - curl -k -X 'POST' \ - "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -H "Authorization: Bearer $API_KEY" \ - -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}" + attempt=0 + max_attempts=24 # 5 minutes * 24 = 2 hours - - name: Poll Test Status - id: poll_step - run: | - status="" - while [[ "$status" != "failure" && "$status" != "success" ]]; do - response=$(curl -k -X 'GET' \ - "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H "Authorization: Bearer $API_KEY") - 
echo "Response: $response" - set +x - status=$(echo $response | jq -r '.status') - echo "Test status: $status" - if [[ "$status" == "failure" ]]; then - echo "Test failed" - exit 1 # Fail the job step if status is failure - elif [[ "$status" == "success" || "$status" == "null" ]]; then + while [[ $attempt -lt $max_attempts ]]; do + # the following command will fail until the artifacts are available ... + S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \ + | jq -r '.Contents[]?.Key' \ + | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \ + | sort --version-sort \ + | tail -1) || true # ... thus ignore errors from the command + if [[ -n "${S3_KEY}" ]]; then + echo "Artifact found: $S3_KEY" + echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV break - elif [[ "$status" == "too_many_runs" ]]; then - echo "Too many runs already running" - echo "too_many_runs=true" >> "$GITHUB_OUTPUT" - exit 1 fi - - sleep 60 # Poll every 60 seconds + + # Increment attempt counter and sleep for 5 minutes + attempt=$((attempt + 1)) + echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..." + sleep 300 # Sleep for 5 minutes done - - name: Retrieve Test Logs - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - run: | - curl -k -X 'GET' \ - "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ - -H 'accept: application/gzip' \ - -H "Authorization: Bearer $API_KEY" \ - --output "test_log_${GITHUB_RUN_ID}.gz" + if [[ -z "${S3_KEY}" ]]; then + echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH" after 2 hours + else + mkdir -p $(dirname $ARCHIVE) + time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE} + mkdir -p ${NEON_DIR} + time tar -xf ${ARCHIVE} -C ${NEON_DIR} + rm -f ${ARCHIVE} + fi - - name: Unzip Test Log and Print it into this job's log - if: always() && steps.poll_step.outputs.too_many_runs != 'true' + - name: Download snapshots from S3 + if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }} + id: download_snapshots + shell: bash -euxo pipefail {0} run: | - gzip -d "test_log_${GITHUB_RUN_ID}.gz" - cat "test_log_${GITHUB_RUN_ID}" + # Download the snapshots from S3 + mkdir -p ${TEST_OUTPUT} + mkdir -p $BACKUP_DIR + cd $BACKUP_DIR + mkdir parts + cd parts + PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \ + | jq -r '.Contents[]?.Key' \ + | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \ + | sort \ + | tail -1) + echo "Latest PART: $PART" + if [[ -z "$PART" ]]; then + echo "ERROR: No matching S3 key found" >&2 + exit 1 + fi + S3_KEY=$(dirname $PART) + time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ . 
+ cd $TEST_OUTPUT + time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions + rm -rf ${BACKUP_DIR} + + - name: Cache poetry deps + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -euxo pipefail {0} + run: ./scripts/pysync + + # we need high number of open files for pagebench + - name: show ulimits + shell: bash -euxo pipefail {0} + run: | + ulimit -a + + - name: Run pagebench testcase + shell: bash -euxo pipefail {0} + env: + CI: false # need to override this env variable set by github to enforce using snapshots + run: | + export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE} + # report the commit hash of the neon repository in the revision of the test results + export GITHUB_SHA=${COMMIT_HASH} + rm -rf ${PERF_REPORT_DIR} + rm -rf ${ALLURE_RESULTS_DIR} + mkdir -p ${PERF_REPORT_DIR} + mkdir -p ${ALLURE_RESULTS_DIR} + PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA" + EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json" + # run only two selected tests + # environment set by parent: + # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release + ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS} + ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS} + + - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results + shell: bash -euxo pipefail {0} + run: | + export REPORT_FROM="$PERF_REPORT_DIR" + export GITHUB_SHA=${COMMIT_HASH} + time ./scripts/generate_and_push_perf_report.sh + + - name: Upload test results + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-store + with: + report-dir: ${{ steps.set-env.outputs.allure_results_dir }} + unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Upload snapshots + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }} + id: upload_snapshots + shell: bash -euxo pipefail {0} + run: | + mkdir -p $BACKUP_DIR + cd $TEST_OUTPUT + tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst + cd $BACKUP_DIR + mkdir parts + split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part. + SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD + cd parts + time aws s3 cp --recursive . 
s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/ + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 @@ -157,26 +260,22 @@ jobs: slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - + - name: Cleanup Test Resources if: always() + shell: bash -euxo pipefail {0} + env: + ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst run: | - curl -k -X 'POST' \ - "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H "Authorization: Bearer $API_KEY" \ - -d '' + # Cleanup the test resources + if [[ -d "${BACKUP_DIR}" ]]; then + rm -rf ${BACKUP_DIR} + fi + if [[ -d "${TEST_OUTPUT}" ]]; then + rm -rf ${TEST_OUTPUT} + fi + if [[ -d "${NEON_DIR}" ]]; then + rm -rf ${NEON_DIR} + fi + rm -rf $(dirname $ARCHIVE) - - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine) - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} - role-duration-seconds: 3600 - - - name: Stop EC2 instance and wait for the instance to be stopped - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - run: | - aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID - aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index a9a90c7370..c74ef9d899 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -32,12 +32,6 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. 
FALLBACK_DURATION = { - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e413b3c6d2..5c92f2e2d0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -682,7 +682,7 @@ class NeonEnvBuilder: log.info( f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" ) - shutil.copytree(tenants_from_dir, tenants_to_dir) + subprocess.run(["cp", "-a", tenants_from_dir, tenants_to_dir], check=True) else: log.info( f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" @@ -698,8 +698,9 @@ class NeonEnvBuilder: shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) if self.test_overlay_dir is None: log.info("Copying local_fs_remote_storage directory from snapshot") - shutil.copytree( - repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + subprocess.run( + ["cp", "-a", f"{repo_dir / 'local_fs_remote_storage'}", f"{self.repo_dir}"], + check=True, ) else: log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot") diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 8874fe663b..41696bf887 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -14,7 +14,7 @@ from fixtures.neon_fixtures import ( PgBin, wait_for_last_flush_lsn, ) -from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci +from fixtures.utils import get_scale_for_db, humantime_to_ms from performance.pageserver.util import setup_pageserver_with_tenants @@ -36,9 +36,6 @@ if TYPE_CHECKING: @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [500]) @pytest.mark.timeout(10000) -@skip_on_ci( - "This test needs lot of resources and should run on dedicated HW, not in github action runners as 
part of CI" -) def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, @@ -63,9 +60,6 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("n_clients", [1, 64]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) -@skip_on_ci( - "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" -) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, From 6768a71c8656dd9bb28bcd57f042c7306cb23c9e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 23 May 2025 20:41:12 +0100 Subject: [PATCH 141/142] proxy(tokio-postgres): refactor typeinfo query to occur earlier (#11993) ## Problem For #11992 I realised we need to get the type info before executing the query. This is important to know how to decode rows with custom types, eg the following query: ```sql CREATE TYPE foo AS ENUM ('foo','bar','baz'); SELECT ARRAY['foo'::foo, 'bar'::foo, 'baz'::foo] AS data; ``` Getting that to work was harder that it seems. The original tokio-postgres setup has a split between `Client` and `Connection`, where messages are passed between. Because multiple clients were supported, each client message included a dedicated response channel. Each request would be terminated by the `ReadyForQuery` message. The flow I opted to use for parsing types early would not trigger a `ReadyForQuery`. The flow is as follows: ``` PARSE "" // parse the user provided query DESCRIBE "" // describe the query, returning param/result type oids FLUSH // force postgres to flush the responses early // wait for descriptions // check if we know the types, if we don't then // setup the typeinfo query and execute it against each OID: PARSE typeinfo // prepare our typeinfo query DESCRIBE typeinfo FLUSH // force postgres to flush the responses early // wait for typeinfo statement // for each OID we don't know: BIND typeinfo EXECUTE FLUSH // wait for type info, might reveal more OIDs to inspect // close the typeinfo query, we cache the OID->type map and this is kinder to pgbouncer. CLOSE typeinfo // finally once we know all the OIDs: BIND "" // bind the user provided query - already parsed - to the user provided params EXECUTE // run the user provided query SYNC // commit the transaction ``` ## Summary of changes Please review commit by commit. The main challenge was allowing one query to issue multiple sub-queries. To do this I first made sure that the client could fully own the connection, which required removing any shared client state. I then had to replace the way responses are sent to the client, by using only a single permanent channel. This required some additional effort to track which query is being processed. Lastly I had to modify the query/typeinfo functions to not issue `sync` commands, so it would fit into the desired flow above. To note: the flow above does force an extra roundtrip into each query. I don't know yet if this has a measurable latency overhead. 
--- .../src/message/frontend.rs | 7 + libs/proxy/postgres-types2/src/lib.rs | 124 +------- libs/proxy/postgres-types2/src/type_gen.rs | 21 +- libs/proxy/tokio-postgres2/src/client.rs | 198 ++++++++---- libs/proxy/tokio-postgres2/src/codec.rs | 13 +- libs/proxy/tokio-postgres2/src/connect.rs | 8 +- libs/proxy/tokio-postgres2/src/connection.rs | 92 ++---- .../tokio-postgres2/src/generic_client.rs | 22 +- libs/proxy/tokio-postgres2/src/lib.rs | 7 - libs/proxy/tokio-postgres2/src/prepare.rs | 285 ++++++++++++------ libs/proxy/tokio-postgres2/src/query.rs | 271 ++++------------- .../proxy/tokio-postgres2/src/simple_query.rs | 36 +-- libs/proxy/tokio-postgres2/src/statement.rs | 54 +--- libs/proxy/tokio-postgres2/src/transaction.rs | 19 +- proxy/src/serverless/sql_over_http.rs | 88 +++--- 15 files changed, 500 insertions(+), 745 deletions(-) diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index b447290ea8..9faed2c065 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -25,6 +25,7 @@ where Ok(()) } +#[derive(Debug)] pub enum BindError { Conversion(Box), Serialization(io::Error), @@ -288,6 +289,12 @@ pub fn sync(buf: &mut BytesMut) { write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); } +#[inline] +pub fn flush(buf: &mut BytesMut) { + buf.put_u8(b'H'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + #[inline] pub fn terminate(buf: &mut BytesMut) { buf.put_u8(b'X'); diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index b6bcabc922..7c9874bda3 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -9,7 +9,6 @@ use std::error::Error; use std::fmt; use std::sync::Arc; -use bytes::BytesMut; use fallible_iterator::FallibleIterator; #[doc(inline)] pub use postgres_protocol2::Oid; @@ -27,41 +26,6 @@ macro_rules! accepts { ) } -/// Generates an implementation of `ToSql::to_sql_checked`. -/// -/// All `ToSql` implementations should use this macro. -macro_rules! to_sql_checked { - () => { - fn to_sql_checked( - &self, - ty: &$crate::Type, - out: &mut $crate::private::BytesMut, - ) -> ::std::result::Result< - $crate::IsNull, - Box, - > { - $crate::__to_sql_checked(self, ty, out) - } - }; -} - -// WARNING: this function is not considered part of this crate's public API. -// It is subject to change at any time. -#[doc(hidden)] -pub fn __to_sql_checked( - v: &T, - ty: &Type, - out: &mut BytesMut, -) -> Result> -where - T: ToSql, -{ - if !T::accepts(ty) { - return Err(Box::new(WrongType::new::(ty.clone()))); - } - v.to_sql(ty, out) -} - // mod pg_lsn; #[doc(hidden)] pub mod private; @@ -142,7 +106,7 @@ pub enum Kind { /// An array type along with the type of its elements. Array(Type), /// A range type along with the type of its elements. - Range(Type), + Range(Oid), /// A multirange type along with the type of its elements. Multirange(Type), /// A domain type along with its underlying type. @@ -377,43 +341,6 @@ pub enum IsNull { No, } -/// A trait for types that can be converted into Postgres values. -pub trait ToSql: fmt::Debug { - /// Converts the value of `self` into the binary format of the specified - /// Postgres `Type`, appending it to `out`. - /// - /// The caller of this method is responsible for ensuring that this type - /// is compatible with the Postgres `Type`. 
- /// - /// The return value indicates if this value should be represented as - /// `NULL`. If this is the case, implementations **must not** write - /// anything to `out`. - fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result> - where - Self: Sized; - - /// Determines if a value of this type can be converted to the specified - /// Postgres `Type`. - fn accepts(ty: &Type) -> bool - where - Self: Sized; - - /// An adaptor method used internally by Rust-Postgres. - /// - /// *All* implementations of this method should be generated by the - /// `to_sql_checked!()` macro. - fn to_sql_checked( - &self, - ty: &Type, - out: &mut BytesMut, - ) -> Result>; - - /// Specify the encode format - fn encode_format(&self, _ty: &Type) -> Format { - Format::Binary - } -} - /// Supported Postgres message format types /// /// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8` @@ -424,52 +351,3 @@ pub enum Format { /// Compact, typed binary format Binary, } - -impl ToSql for &str { - fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result> { - match *ty { - ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w), - ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w), - ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w), - _ => types::text_to_sql(self, w), - } - Ok(IsNull::No) - } - - fn accepts(ty: &Type) -> bool { - match *ty { - Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, - ref ty - if (ty.name() == "citext" - || ty.name() == "ltree" - || ty.name() == "lquery" - || ty.name() == "ltxtquery") => - { - true - } - _ => false, - } - } - - to_sql_checked!(); -} - -macro_rules! simple_to { - ($t:ty, $f:ident, $($expected:ident),+) => { - impl ToSql for $t { - fn to_sql(&self, - _: &Type, - w: &mut BytesMut) - -> Result> { - types::$f(*self, w); - Ok(IsNull::No) - } - - accepts!($($expected),+); - - to_sql_checked!(); - } - } -} - -simple_to!(u32, oid_to_sql, OID); diff --git a/libs/proxy/postgres-types2/src/type_gen.rs b/libs/proxy/postgres-types2/src/type_gen.rs index a1bc3f85c0..6e6163e343 100644 --- a/libs/proxy/postgres-types2/src/type_gen.rs +++ b/libs/proxy/postgres-types2/src/type_gen.rs @@ -393,7 +393,7 @@ impl Inner { } } - pub fn oid(&self) -> Oid { + pub const fn const_oid(&self) -> Oid { match *self { Inner::Bool => 16, Inner::Bytea => 17, @@ -580,7 +580,14 @@ impl Inner { Inner::TstzmultiRangeArray => 6153, Inner::DatemultiRangeArray => 6155, Inner::Int8multiRangeArray => 6157, + Inner::Other(_) => u32::MAX, + } + } + + pub fn oid(&self) -> Oid { + match *self { Inner::Other(ref u) => u.oid, + _ => self.const_oid(), } } @@ -727,17 +734,17 @@ impl Inner { Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)), Inner::AnyRange => &Kind::Pseudo, Inner::EventTrigger => &Kind::Pseudo, - Inner::Int4Range => &Kind::Range(Type(Inner::Int4)), + Inner::Int4Range => &const { Kind::Range(Inner::Int4.const_oid()) }, Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)), - Inner::NumRange => &Kind::Range(Type(Inner::Numeric)), + Inner::NumRange => &const { Kind::Range(Inner::Numeric.const_oid()) }, Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)), - Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)), + Inner::TsRange => &const { Kind::Range(Inner::Timestamp.const_oid()) }, Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)), - Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)), + Inner::TstzRange => &const { Kind::Range(Inner::Timestamptz.const_oid()) }, 
Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)), - Inner::DateRange => &Kind::Range(Type(Inner::Date)), + Inner::DateRange => &const { Kind::Range(Inner::Date.const_oid()) }, Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)), - Inner::Int8Range => &Kind::Range(Type(Inner::Int8)), + Inner::Int8Range => &const { Kind::Range(Inner::Int8.const_oid()) }, Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)), Inner::Jsonpath => &Kind::Simple, Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)), diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 186eb07000..a7edfc076a 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,14 +1,12 @@ use std::collections::HashMap; use std::fmt; use std::net::IpAddr; -use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{TryStreamExt, future, ready}; -use parking_lot::Mutex; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use serde::{Deserialize, Serialize}; @@ -16,29 +14,52 @@ use tokio::sync::mpsc; use crate::codec::{BackendMessages, FrontendMessage}; use crate::config::{Host, SslMode}; -use crate::connection::{Request, RequestMessages}; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, Type}; use crate::{ - CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction, - TransactionBuilder, query, simple_query, + CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Transaction, TransactionBuilder, + query, simple_query, }; pub struct Responses { + /// new messages from conn receiver: mpsc::Receiver, + /// current batch of messages cur: BackendMessages, + /// number of total queries sent. + waiting: usize, + /// number of ReadyForQuery messages received. + received: usize, } impl Responses { pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll> { loop { - match self.cur.next().map_err(Error::parse)? { - Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))), - Some(message) => return Poll::Ready(Ok(message)), - None => {} + // get the next saved message + if let Some(message) = self.cur.next().map_err(Error::parse)? { + let received = self.received; + + // increase the query head if this is the last message. + if let Message::ReadyForQuery(_) = message { + self.received += 1; + } + + // check if the client has skipped this query. + if received + 1 < self.waiting { + // grab the next message. + continue; + } + + // convenience: turn the error messaage into a proper error. + let res = match message { + Message::ErrorResponse(body) => Err(Error::db(body)), + message => Ok(message), + }; + return Poll::Ready(res); } + // get the next batch of messages. match ready!(self.receiver.poll_recv(cx)) { Some(messages) => self.cur = messages, None => return Poll::Ready(Err(Error::closed())), @@ -55,44 +76,87 @@ impl Responses { /// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] pub(crate) struct CachedTypeInfo { - /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its - /// fallback). - pub(crate) typeinfo: Option, - /// Cache of types already looked up. 
pub(crate) types: HashMap, } pub struct InnerClient { - sender: mpsc::UnboundedSender, + sender: mpsc::UnboundedSender, + responses: Responses, /// A buffer to use when writing out postgres commands. - buffer: Mutex, + buffer: BytesMut, } impl InnerClient { - pub fn send(&self, messages: RequestMessages) -> Result { - let (sender, receiver) = mpsc::channel(1); - let request = Request { messages, sender }; - self.sender.send(request).map_err(|_| Error::closed())?; - - Ok(Responses { - receiver, - cur: BackendMessages::empty(), - }) + pub fn start(&mut self) -> Result { + self.responses.waiting += 1; + Ok(PartialQuery(Some(self))) } - /// Call the given function with a buffer to be used when writing out - /// postgres commands. - pub fn with_buf(&self, f: F) -> R + // pub fn send_with_sync(&mut self, f: F) -> Result<&mut Responses, Error> + // where + // F: FnOnce(&mut BytesMut) -> Result<(), Error>, + // { + // self.start()?.send_with_sync(f) + // } + + pub fn send_simple_query(&mut self, query: &str) -> Result<&mut Responses, Error> { + self.responses.waiting += 1; + + self.buffer.clear(); + // simple queries do not need sync. + frontend::query(query, &mut self.buffer).map_err(Error::encode)?; + let buf = self.buffer.split().freeze(); + self.send_message(FrontendMessage::Raw(buf)) + } + + fn send_message(&mut self, messages: FrontendMessage) -> Result<&mut Responses, Error> { + self.sender.send(messages).map_err(|_| Error::closed())?; + Ok(&mut self.responses) + } +} + +pub struct PartialQuery<'a>(Option<&'a mut InnerClient>); + +impl Drop for PartialQuery<'_> { + fn drop(&mut self) { + if let Some(client) = self.0.take() { + client.buffer.clear(); + frontend::sync(&mut client.buffer); + let buf = client.buffer.split().freeze(); + let _ = client.send_message(FrontendMessage::Raw(buf)); + } + } +} + +impl<'a> PartialQuery<'a> { + pub fn send_with_flush(&mut self, f: F) -> Result<&mut Responses, Error> where - F: FnOnce(&mut BytesMut) -> R, + F: FnOnce(&mut BytesMut) -> Result<(), Error>, { - let mut buffer = self.buffer.lock(); - let r = f(&mut buffer); - buffer.clear(); - r + let client = self.0.as_deref_mut().unwrap(); + + client.buffer.clear(); + f(&mut client.buffer)?; + frontend::flush(&mut client.buffer); + let buf = client.buffer.split().freeze(); + client.send_message(FrontendMessage::Raw(buf)) + } + + pub fn send_with_sync(mut self, f: F) -> Result<&'a mut Responses, Error> + where + F: FnOnce(&mut BytesMut) -> Result<(), Error>, + { + let client = self.0.as_deref_mut().unwrap(); + + client.buffer.clear(); + f(&mut client.buffer)?; + frontend::sync(&mut client.buffer); + let buf = client.buffer.split().freeze(); + let _ = client.send_message(FrontendMessage::Raw(buf)); + + Ok(&mut self.0.take().unwrap().responses) } } @@ -109,7 +173,7 @@ pub struct SocketConfig { /// The client is one half of what is returned when a connection is established. Users interact with the database /// through this client object. 
pub struct Client { - inner: Arc, + inner: InnerClient, cached_typeinfo: CachedTypeInfo, socket_config: SocketConfig, @@ -120,17 +184,24 @@ pub struct Client { impl Client { pub(crate) fn new( - sender: mpsc::UnboundedSender, + sender: mpsc::UnboundedSender, + receiver: mpsc::Receiver, socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, ) -> Client { Client { - inner: Arc::new(InnerClient { + inner: InnerClient { sender, + responses: Responses { + receiver, + cur: BackendMessages::empty(), + waiting: 0, + received: 0, + }, buffer: Default::default(), - }), + }, cached_typeinfo: Default::default(), socket_config, @@ -145,19 +216,29 @@ impl Client { self.process_id } - pub(crate) fn inner(&self) -> &Arc { - &self.inner + pub(crate) fn inner_mut(&mut self) -> &mut InnerClient { + &mut self.inner } /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip - pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + pub async fn query_raw_txt( + &mut self, + statement: &str, + params: I, + ) -> Result where S: AsRef, I: IntoIterator>, I::IntoIter: ExactSizeIterator, { - query::query_txt(&self.inner, statement, params).await + query::query_txt( + &mut self.inner, + &mut self.cached_typeinfo, + statement, + params, + ) + .await } /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. @@ -173,12 +254,15 @@ impl Client { /// Prepared statements should be use for any query which contains user-specified data, as they provided the /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass /// them to this method! - pub async fn simple_query(&self, query: &str) -> Result, Error> { + pub async fn simple_query(&mut self, query: &str) -> Result, Error> { self.simple_query_raw(query).await?.try_collect().await } - pub(crate) async fn simple_query_raw(&self, query: &str) -> Result { - simple_query::simple_query(self.inner(), query).await + pub(crate) async fn simple_query_raw( + &mut self, + query: &str, + ) -> Result { + simple_query::simple_query(self.inner_mut(), query).await } /// Executes a sequence of SQL statements using the simple query protocol. @@ -191,15 +275,11 @@ impl Client { /// Prepared statements should be use for any query which contains user-specified data, as they provided the /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass /// them to this method! - pub async fn batch_execute(&self, query: &str) -> Result { - simple_query::batch_execute(self.inner(), query).await + pub async fn batch_execute(&mut self, query: &str) -> Result { + simple_query::batch_execute(self.inner_mut(), query).await } pub async fn discard_all(&mut self) -> Result { - // clear the prepared statements that are about to be nuked from the postgres session - - self.cached_typeinfo.typeinfo = None; - self.batch_execute("discard all").await } @@ -208,7 +288,7 @@ impl Client { /// The transaction will roll back by default - use the `commit` method to commit it. 
pub async fn transaction(&mut self) -> Result, Error> { struct RollbackIfNotDone<'me> { - client: &'me Client, + client: &'me mut Client, done: bool, } @@ -218,14 +298,7 @@ impl Client { return; } - let buf = self.client.inner().with_buf(|buf| { - frontend::query("ROLLBACK", buf).unwrap(); - buf.split().freeze() - }); - let _ = self - .client - .inner() - .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + let _ = self.client.inner.send_simple_query("ROLLBACK"); } } @@ -239,7 +312,7 @@ impl Client { client: self, done: false, }; - self.batch_execute("BEGIN").await?; + cleaner.client.batch_execute("BEGIN").await?; cleaner.done = true; } @@ -265,11 +338,6 @@ impl Client { } } - /// Query for type information - pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result { - crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await - } - /// Determines if the connection to the server has already closed. /// /// In that case, all future queries will fail. diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index f1fd9b47b3..daa5371426 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -1,21 +1,16 @@ use std::io; -use bytes::{Buf, Bytes, BytesMut}; +use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; -use postgres_protocol2::message::frontend::CopyData; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { Raw(Bytes), - CopyData(CopyData>), } pub enum BackendMessage { - Normal { - messages: BackendMessages, - request_complete: bool, - }, + Normal { messages: BackendMessages }, Async(backend::Message), } @@ -44,7 +39,6 @@ impl Encoder for PostgresCodec { fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> { match item { FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf), - FrontendMessage::CopyData(data) => data.write(dst), } Ok(()) @@ -57,7 +51,6 @@ impl Decoder for PostgresCodec { fn decode(&mut self, src: &mut BytesMut) -> Result, io::Error> { let mut idx = 0; - let mut request_complete = false; while let Some(header) = backend::Header::parse(&src[idx..])? 
{ let len = header.len() as usize + 1; @@ -82,7 +75,6 @@ impl Decoder for PostgresCodec { idx += len; if header.tag() == backend::READY_FOR_QUERY_TAG { - request_complete = true; break; } } @@ -92,7 +84,6 @@ impl Decoder for PostgresCodec { } else { Ok(Some(BackendMessage::Normal { messages: BackendMessages(src.split_to(idx)), - request_complete, })) } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 7c3a358bba..39a0a87c74 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -59,9 +59,11 @@ where connect_timeout: config.connect_timeout, }; - let (sender, receiver) = mpsc::unbounded_channel(); + let (client_tx, conn_rx) = mpsc::unbounded_channel(); + let (conn_tx, client_rx) = mpsc::channel(4); let client = Client::new( - sender, + client_tx, + client_rx, socket_config, config.ssl_mode, process_id, @@ -74,7 +76,7 @@ where .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) .collect(); - let connection = Connection::new(stream, delayed, parameters, receiver); + let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx); Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index 99d6f3f8e2..fe0372b266 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -4,7 +4,6 @@ use std::pin::Pin; use std::task::{Context, Poll}; use bytes::BytesMut; -use fallible_iterator::FallibleIterator; use futures_util::{Sink, Stream, ready}; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; @@ -19,30 +18,12 @@ use crate::error::DbError; use crate::maybe_tls_stream::MaybeTlsStream; use crate::{AsyncMessage, Error, Notification}; -pub enum RequestMessages { - Single(FrontendMessage), -} - -pub struct Request { - pub messages: RequestMessages, - pub sender: mpsc::Sender, -} - -pub struct Response { - sender: PollSender, -} - #[derive(PartialEq, Debug)] enum State { Active, Closing, } -enum WriteReady { - Terminating, - WaitingOnRead, -} - /// A connection to a PostgreSQL database. /// /// This is one half of what is returned when a new connection is established. It performs the actual IO with the @@ -56,9 +37,11 @@ pub struct Connection { pub stream: Framed, PostgresCodec>, /// HACK: we need this in the Neon Proxy to forward params. 
pub parameters: HashMap, - receiver: mpsc::UnboundedReceiver, + + sender: PollSender, + receiver: mpsc::UnboundedReceiver, + pending_responses: VecDeque, - responses: VecDeque, state: State, } @@ -71,14 +54,15 @@ where stream: Framed, PostgresCodec>, pending_responses: VecDeque, parameters: HashMap, - receiver: mpsc::UnboundedReceiver, + sender: mpsc::Sender, + receiver: mpsc::UnboundedReceiver, ) -> Connection { Connection { stream, parameters, + sender: PollSender::new(sender), receiver, pending_responses, - responses: VecDeque::new(), state: State::Active, } } @@ -110,7 +94,7 @@ where } }; - let (mut messages, request_complete) = match message { + let messages = match message { BackendMessage::Async(Message::NoticeResponse(body)) => { let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; return Poll::Ready(Ok(AsyncMessage::Notice(error))); @@ -131,41 +115,19 @@ where continue; } BackendMessage::Async(_) => unreachable!(), - BackendMessage::Normal { - messages, - request_complete, - } => (messages, request_complete), + BackendMessage::Normal { messages } => messages, }; - let mut response = match self.responses.pop_front() { - Some(response) => response, - None => match messages.next().map_err(Error::parse)? { - Some(Message::ErrorResponse(error)) => { - return Poll::Ready(Err(Error::db(error))); - } - _ => return Poll::Ready(Err(Error::unexpected_message())), - }, - }; - - match response.sender.poll_reserve(cx) { + match self.sender.poll_reserve(cx) { Poll::Ready(Ok(())) => { - let _ = response.sender.send_item(messages); - if !request_complete { - self.responses.push_front(response); - } + let _ = self.sender.send_item(messages); } Poll::Ready(Err(_)) => { - // we need to keep paging through the rest of the messages even if the receiver's hung up - if !request_complete { - self.responses.push_front(response); - } + return Poll::Ready(Err(Error::closed())); } Poll::Pending => { - self.responses.push_front(response); - self.pending_responses.push_back(BackendMessage::Normal { - messages, - request_complete, - }); + self.pending_responses + .push_back(BackendMessage::Normal { messages }); trace!("poll_read: waiting on sender"); return Poll::Pending; } @@ -174,7 +136,7 @@ where } /// Fetch the next client request and enqueue the response sender. - fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { + fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { if self.receiver.is_closed() { return Poll::Ready(None); } @@ -182,10 +144,7 @@ where match self.receiver.poll_recv(cx) { Poll::Ready(Some(request)) => { trace!("polled new request"); - self.responses.push_back(Response { - sender: PollSender::new(request.sender), - }); - Poll::Ready(Some(request.messages)) + Poll::Ready(Some(request)) } Poll::Ready(None) => Poll::Ready(None), Poll::Pending => Poll::Pending, @@ -194,7 +153,7 @@ where /// Process client requests and write them to the postgres connection, flushing if necessary. /// client -> postgres - fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { + fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { loop { if Pin::new(&mut self.stream) .poll_ready(cx) @@ -209,14 +168,14 @@ where match self.poll_request(cx) { // send the message to postgres - Poll::Ready(Some(RequestMessages::Single(request))) => { + Poll::Ready(Some(request)) => { Pin::new(&mut self.stream) .start_send(request) .map_err(Error::io)?; } // No more messages from the client, and no more responses to wait for. 
// Send a terminate message to postgres - Poll::Ready(None) if self.responses.is_empty() => { + Poll::Ready(None) => { trace!("poll_write: at eof, terminating"); let mut request = BytesMut::new(); frontend::terminate(&mut request); @@ -228,16 +187,7 @@ where trace!("poll_write: sent eof, closing"); trace!("poll_write: done"); - return Poll::Ready(Ok(WriteReady::Terminating)); - } - // No more messages from the client, but there are still some responses to wait for. - Poll::Ready(None) => { - trace!( - "poll_write: at eof, pending responses {}", - self.responses.len() - ); - ready!(self.poll_flush(cx))?; - return Poll::Ready(Ok(WriteReady::WaitingOnRead)); + return Poll::Ready(Ok(())); } // Still waiting for a message from the client. Poll::Pending => { @@ -298,7 +248,7 @@ where // if the state is still active, try read from and write to postgres. let message = self.poll_read(cx)?; let closing = self.poll_write(cx)?; - if let Poll::Ready(WriteReady::Terminating) = closing { + if let Poll::Ready(()) = closing { self.state = State::Closing; } diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 8e28843347..eeefb45d26 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,9 +1,6 @@ #![allow(async_fn_in_trait)] -use postgres_protocol2::Oid; - use crate::query::RowStream; -use crate::types::Type; use crate::{Client, Error, Transaction}; mod private { @@ -15,20 +12,17 @@ mod private { /// This trait is "sealed", and cannot be implemented outside of this crate. pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. - async fn query_raw_txt(&self, statement: &str, params: I) -> Result + async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, I::IntoIter: ExactSizeIterator + Sync + Send; - - /// Query for type information - async fn get_type(&mut self, oid: Oid) -> Result; } impl private::Sealed for Client {} impl GenericClient for Client { - async fn query_raw_txt(&self, statement: &str, params: I) -> Result + async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, @@ -36,17 +30,12 @@ impl GenericClient for Client { { self.query_raw_txt(statement, params).await } - - /// Query for type information - async fn get_type(&mut self, oid: Oid) -> Result { - self.get_type_inner(oid).await - } } impl private::Sealed for Transaction<'_> {} impl GenericClient for Transaction<'_> { - async fn query_raw_txt(&self, statement: &str, params: I) -> Result + async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, @@ -54,9 +43,4 @@ impl GenericClient for Transaction<'_> { { self.query_raw_txt(statement, params).await } - - /// Query for type information - async fn get_type(&mut self, oid: Oid) -> Result { - self.client_mut().get_type(oid).await - } } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index c8ebba5487..9556070ed5 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -18,7 +18,6 @@ pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; -use crate::types::ToSql; /// After executing a query, the connection 
will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] @@ -120,9 +119,3 @@ pub enum SimpleQueryMessage { /// The number of rows modified or selected is returned. CommandComplete(u64), } - -fn slice_iter<'a>( - s: &'a [&'a (dyn ToSql + Sync)], -) -> impl ExactSizeIterator + 'a { - s.iter().map(|s| *s as _) -} diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index b27eabcb0e..16b9cf66f4 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,19 +1,14 @@ -use std::future::Future; -use std::pin::Pin; -use std::sync::Arc; - -use bytes::Bytes; +use bytes::BytesMut; use fallible_iterator::FallibleIterator; -use futures_util::{TryStreamExt, pin_mut}; -use postgres_protocol2::message::backend::Message; +use postgres_protocol2::IsNull; +use postgres_protocol2::message::backend::{Message, RowDescriptionBody}; use postgres_protocol2::message::frontend; -use tracing::debug; +use postgres_protocol2::types::oid_to_sql; +use postgres_types2::Format; -use crate::client::{CachedTypeInfo, InnerClient}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; +use crate::client::{CachedTypeInfo, PartialQuery, Responses}; use crate::types::{Kind, Oid, Type}; -use crate::{Column, Error, Statement, query, slice_iter}; +use crate::{Column, Error, Row, Statement}; pub(crate) const TYPEINFO_QUERY: &str = "\ SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid @@ -23,22 +18,51 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; +/// we need to make sure we close this prepared statement. +struct CloseStmt<'a, 'b> { + client: Option<&'a mut PartialQuery<'b>>, + name: &'static str, +} + +impl<'a> CloseStmt<'a, '_> { + fn close(mut self) -> Result<&'a mut Responses, Error> { + let client = self.client.take().unwrap(); + client.send_with_flush(|buf| { + frontend::close(b'S', self.name, buf).map_err(Error::encode)?; + Ok(()) + }) + } +} + +impl Drop for CloseStmt<'_, '_> { + fn drop(&mut self) { + if let Some(client) = self.client.take() { + let _ = client.send_with_flush(|buf| { + frontend::close(b'S', self.name, buf).map_err(Error::encode)?; + Ok(()) + }); + } + } +} + async fn prepare_typecheck( - client: &Arc, + client: &mut PartialQuery<'_>, name: &'static str, query: &str, - types: &[Type], ) -> Result { - let buf = encode(client, name, query, types)?; - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + let responses = client.send_with_flush(|buf| { + frontend::parse(name, query, [], buf).map_err(Error::encode)?; + frontend::describe(b'S', name, buf).map_err(Error::encode)?; + Ok(()) + })?; match responses.next().await? { Message::ParseComplete => {} _ => return Err(Error::unexpected_message()), } - let parameter_description = match responses.next().await? { - Message::ParameterDescription(body) => body, + match responses.next().await? { + Message::ParameterDescription(_) => {} _ => return Err(Error::unexpected_message()), }; @@ -48,13 +72,6 @@ async fn prepare_typecheck( _ => return Err(Error::unexpected_message()), }; - let mut parameters = vec![]; - let mut it = parameter_description.parameters(); - while let Some(oid) = it.next().map_err(Error::parse)? 
{ - let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?; - parameters.push(type_); - } - let mut columns = vec![]; if let Some(row_description) = row_description { let mut it = row_description.fields(); @@ -65,98 +82,168 @@ async fn prepare_typecheck( } } - Ok(Statement::new(client, name, parameters, columns)) + Ok(Statement::new(name, columns)) } -fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { - if types.is_empty() { - debug!("preparing query {}: {}", name, query); - } else { - debug!("preparing query {} with types {:?}: {}", name, types, query); - } - - client.with_buf(|buf| { - frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?; - frontend::describe(b'S', name, buf).map_err(Error::encode)?; - frontend::sync(buf); - Ok(buf.split().freeze()) - }) -} - -pub async fn get_type( - client: &Arc, - typecache: &mut CachedTypeInfo, - oid: Oid, -) -> Result { +fn try_from_cache(typecache: &CachedTypeInfo, oid: Oid) -> Option { if let Some(type_) = Type::from_oid(oid) { - return Ok(type_); + return Some(type_); } if let Some(type_) = typecache.types.get(&oid) { - return Ok(type_.clone()); + return Some(type_.clone()); }; - let stmt = typeinfo_statement(client, typecache).await?; + None +} - let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; - pin_mut!(rows); +pub async fn parse_row_description( + client: &mut PartialQuery<'_>, + typecache: &mut CachedTypeInfo, + row_description: Option, +) -> Result, Error> { + let mut columns = vec![]; - let row = match rows.try_next().await? { - Some(row) => row, - None => return Err(Error::unexpected_message()), + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? { + let type_ = try_from_cache(typecache, field.type_oid()).unwrap_or(Type::UNKNOWN); + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + let all_known = columns.iter().all(|c| c.type_ != Type::UNKNOWN); + if all_known { + // all known, return early. + return Ok(columns); + } + + let typeinfo = "neon_proxy_typeinfo"; + + // make sure to close the typeinfo statement before exiting. + let mut guard = CloseStmt { + name: typeinfo, + client: None, + }; + let client = guard.client.insert(client); + + // get the typeinfo statement. + let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY).await?; + + for column in &mut columns { + column.type_ = get_type(client, typecache, &stmt, column.type_oid()).await?; + } + + // cancel the close guard. + let responses = guard.close()?; + + match responses.next().await? 
{ + Message::CloseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + Ok(columns) +} + +async fn get_type( + client: &mut PartialQuery<'_>, + typecache: &mut CachedTypeInfo, + stmt: &Statement, + mut oid: Oid, +) -> Result { + let mut stack = vec![]; + let mut type_ = loop { + if let Some(type_) = try_from_cache(typecache, oid) { + break type_; + } + + let row = exec(client, stmt, oid).await?; + if stack.len() > 8 { + return Err(Error::unexpected_message()); + } + + let name: String = row.try_get(0)?; + let type_: i8 = row.try_get(1)?; + let elem_oid: Oid = row.try_get(2)?; + let rngsubtype: Option = row.try_get(3)?; + let basetype: Oid = row.try_get(4)?; + let schema: String = row.try_get(5)?; + let relid: Oid = row.try_get(6)?; + + let kind = if type_ == b'e' as i8 { + Kind::Enum + } else if type_ == b'p' as i8 { + Kind::Pseudo + } else if basetype != 0 { + Kind::Domain(basetype) + } else if elem_oid != 0 { + stack.push((name, oid, schema)); + oid = elem_oid; + continue; + } else if relid != 0 { + Kind::Composite(relid) + } else if let Some(rngsubtype) = rngsubtype { + Kind::Range(rngsubtype) + } else { + Kind::Simple + }; + + let type_ = Type::new(name, oid, kind, schema); + typecache.types.insert(oid, type_.clone()); + break type_; }; - let name: String = row.try_get(0)?; - let type_: i8 = row.try_get(1)?; - let elem_oid: Oid = row.try_get(2)?; - let rngsubtype: Option = row.try_get(3)?; - let basetype: Oid = row.try_get(4)?; - let schema: String = row.try_get(5)?; - let relid: Oid = row.try_get(6)?; - - let kind = if type_ == b'e' as i8 { - Kind::Enum - } else if type_ == b'p' as i8 { - Kind::Pseudo - } else if basetype != 0 { - Kind::Domain(basetype) - } else if elem_oid != 0 { - let type_ = get_type_rec(client, typecache, elem_oid).await?; - Kind::Array(type_) - } else if relid != 0 { - Kind::Composite(relid) - } else if let Some(rngsubtype) = rngsubtype { - let type_ = get_type_rec(client, typecache, rngsubtype).await?; - Kind::Range(type_) - } else { - Kind::Simple - }; - - let type_ = Type::new(name, oid, kind, schema); - typecache.types.insert(oid, type_.clone()); + while let Some((name, oid, schema)) = stack.pop() { + type_ = Type::new(name, oid, Kind::Array(type_), schema); + typecache.types.insert(oid, type_.clone()); + } Ok(type_) } -fn get_type_rec<'a>( - client: &'a Arc, - typecache: &'a mut CachedTypeInfo, - oid: Oid, -) -> Pin> + Send + 'a>> { - Box::pin(get_type(client, typecache, oid)) -} +/// exec the typeinfo statement returning one row. +async fn exec( + client: &mut PartialQuery<'_>, + statement: &Statement, + param: Oid, +) -> Result { + let responses = client.send_with_flush(|buf| { + encode_bind(statement, param, "", buf); + frontend::execute("", 0, buf).map_err(Error::encode)?; + Ok(()) + })?; -async fn typeinfo_statement( - client: &Arc, - typecache: &mut CachedTypeInfo, -) -> Result { - if let Some(stmt) = &typecache.typeinfo { - return Ok(stmt.clone()); + match responses.next().await? { + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), } - let typeinfo = "neon_proxy_typeinfo"; - let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?; + let row = match responses.next().await? { + Message::DataRow(body) => Row::new(statement.clone(), body, Format::Binary)?, + _ => return Err(Error::unexpected_message()), + }; - typecache.typeinfo = Some(stmt.clone()); - Ok(stmt) + match responses.next().await? 
{ + Message::CommandComplete(_) => {} + _ => return Err(Error::unexpected_message()), + }; + + Ok(row) +} + +fn encode_bind(statement: &Statement, param: Oid, portal: &str, buf: &mut BytesMut) { + frontend::bind( + portal, + statement.name(), + [Format::Binary as i16], + [param], + |param, buf| { + oid_to_sql(param, buf); + Ok(IsNull::No) + }, + [Format::Binary as i16], + buf, + ) + .unwrap(); } diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs index 106bc69d49..5f3ed8ef5a 100644 --- a/libs/proxy/tokio-postgres2/src/query.rs +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -1,76 +1,43 @@ -use std::fmt; -use std::marker::PhantomPinned; use std::pin::Pin; -use std::sync::Arc; use std::task::{Context, Poll}; -use bytes::{BufMut, Bytes, BytesMut}; -use fallible_iterator::FallibleIterator; +use bytes::BufMut; use futures_util::{Stream, ready}; -use pin_project_lite::pin_project; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; -use postgres_types2::{Format, ToSql, Type}; -use tracing::debug; +use postgres_types2::Format; -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::types::IsNull; -use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; +use crate::client::{CachedTypeInfo, InnerClient, Responses}; +use crate::{Error, ReadyForQueryStatus, Row, Statement}; -struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); - -impl fmt::Debug for BorrowToSqlParamsDebug<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_list().entries(self.0.iter()).finish() - } -} - -pub async fn query<'a, I>( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if tracing::enabled!(tracing::Level::DEBUG) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let responses = start(client, buf).await?; - Ok(RowStream { - statement, - responses, - command_tag: None, - status: ReadyForQueryStatus::Unknown, - output_format: Format::Binary, - _p: PhantomPinned, - }) -} - -pub async fn query_txt( - client: &Arc, +pub async fn query_txt<'a, S, I>( + client: &'a mut InnerClient, + typecache: &mut CachedTypeInfo, query: &str, params: I, -) -> Result +) -> Result, Error> where S: AsRef, I: IntoIterator>, I::IntoIter: ExactSizeIterator, { let params = params.into_iter(); + let mut client = client.start()?; - let buf = client.with_buf(|buf| { + // Flow: + // 1. Parse the query + // 2. Inspect the row description for OIDs + // 3. If there's any OIDs we don't already know about, perform the typeinfo routine + // 4. Execute the query + // 5. Sync. + // + // The typeinfo routine: + // 1. Parse the typeinfo query + // 2. Execute the query on each OID + // 3. If the result does not match an OID we know, repeat 2. + + // parse the query and get type info + let responses = client.send_with_flush(|buf| { frontend::parse( "", // unnamed prepared statement query, // query to parse @@ -79,7 +46,30 @@ where ) .map_err(Error::encode)?; frontend::describe(b'S', "", buf).map_err(Error::encode)?; - // Bind, pass params as text, retrieve as binary + Ok(()) + })?; + + match responses.next().await? 
{ + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + match responses.next().await? { + Message::ParameterDescription(_) => {} + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + let columns = + crate::prepare::parse_row_description(&mut client, typecache, row_description).await?; + + let responses = client.send_with_sync(|buf| { + // Bind, pass params as text, retrieve as text match frontend::bind( "", // empty string selects the unnamed portal "", // unnamed prepared statement @@ -102,173 +92,55 @@ where // Execute frontend::execute("", 0, buf).map_err(Error::encode)?; - // Sync - frontend::sync(buf); - Ok(buf.split().freeze()) + Ok(()) })?; - // now read the responses - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; - - match responses.next().await? { - Message::ParseComplete => {} - _ => return Err(Error::unexpected_message()), - } - - let parameter_description = match responses.next().await? { - Message::ParameterDescription(body) => body, - _ => return Err(Error::unexpected_message()), - }; - - let row_description = match responses.next().await? { - Message::RowDescription(body) => Some(body), - Message::NoData => None, - _ => return Err(Error::unexpected_message()), - }; - match responses.next().await? { Message::BindComplete => {} _ => return Err(Error::unexpected_message()), } - let mut parameters = vec![]; - let mut it = parameter_description.parameters(); - while let Some(oid) = it.next().map_err(Error::parse)? { - let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN); - parameters.push(type_); - } - - let mut columns = vec![]; - if let Some(row_description) = row_description { - let mut it = row_description.fields(); - while let Some(field) = it.next().map_err(Error::parse)? { - let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN); - let column = Column::new(field.name().to_string(), type_, field); - columns.push(column); - } - } - Ok(RowStream { - statement: Statement::new_anonymous(parameters, columns), responses, + statement: Statement::new("", columns), command_tag: None, status: ReadyForQueryStatus::Unknown, output_format: Format::Text, - _p: PhantomPinned, }) } -async fn start(client: &InnerClient, buf: Bytes) -> Result { - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; - - match responses.next().await? { - Message::BindComplete => {} - _ => return Err(Error::unexpected_message()), - } - - Ok(responses) +/// A stream of table rows. 
+pub struct RowStream<'a> { + responses: &'a mut Responses, + output_format: Format, + pub statement: Statement, + pub command_tag: Option, + pub status: ReadyForQueryStatus, } -pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - client.with_buf(|buf| { - encode_bind(statement, params, "", buf)?; - frontend::execute("", 0, buf).map_err(Error::encode)?; - frontend::sync(buf); - Ok(buf.split().freeze()) - }) -} - -pub fn encode_bind<'a, I>( - statement: &Statement, - params: I, - portal: &str, - buf: &mut BytesMut, -) -> Result<(), Error> -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let param_types = statement.params(); - let params = params.into_iter(); - - assert!( - param_types.len() == params.len(), - "expected {} parameters but got {}", - param_types.len(), - params.len() - ); - - let (param_formats, params): (Vec<_>, Vec<_>) = params - .zip(param_types.iter()) - .map(|(p, ty)| (p.encode_format(ty) as i16, p)) - .unzip(); - - let params = params.into_iter(); - - let mut error_idx = 0; - let r = frontend::bind( - portal, - statement.name(), - param_formats, - params.zip(param_types).enumerate(), - |(idx, (param, ty)), buf| match param.to_sql_checked(ty, buf) { - Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No), - Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes), - Err(e) => { - error_idx = idx; - Err(e) - } - }, - Some(1), - buf, - ); - match r { - Ok(()) => Ok(()), - Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)), - Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), - } -} - -pin_project! { - /// A stream of table rows. - pub struct RowStream { - statement: Statement, - responses: Responses, - command_tag: Option, - output_format: Format, - status: ReadyForQueryStatus, - #[pin] - _p: PhantomPinned, - } -} - -impl Stream for RowStream { +impl Stream for RowStream<'_> { type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let this = self.project(); + let this = self.get_mut(); loop { match ready!(this.responses.poll_next(cx)?) { Message::DataRow(body) => { return Poll::Ready(Some(Ok(Row::new( this.statement.clone(), body, - *this.output_format, + this.output_format, )?))); } Message::EmptyQueryResponse | Message::PortalSuspended => {} Message::CommandComplete(body) => { if let Ok(tag) = body.tag() { - *this.command_tag = Some(tag.to_string()); + this.command_tag = Some(tag.to_string()); } } Message::ReadyForQuery(status) => { - *this.status = status.into(); + this.status = status.into(); return Poll::Ready(None); } _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), @@ -276,24 +148,3 @@ impl Stream for RowStream { } } } - -impl RowStream { - /// Returns information about the columns of data in the row. - pub fn columns(&self) -> &[Column] { - self.statement.columns() - } - - /// Returns the command tag of this query. - /// - /// This is only available after the stream has been exhausted. - pub fn command_tag(&self) -> Option { - self.command_tag.clone() - } - - /// Returns if the connection is ready for querying, with the status of the connection. - /// - /// This might be available only after the stream has been exhausted. 
- pub fn ready_status(&self) -> ReadyForQueryStatus { - self.status - } -} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs index 2cf17188cf..e1ed48cdaf 100644 --- a/libs/proxy/tokio-postgres2/src/simple_query.rs +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -1,19 +1,14 @@ -use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use bytes::Bytes; use fallible_iterator::FallibleIterator; use futures_util::{Stream, ready}; use pin_project_lite::pin_project; use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use tracing::debug; use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; /// Information about a column of a single query row. @@ -33,28 +28,28 @@ impl SimpleColumn { } } -pub async fn simple_query(client: &InnerClient, query: &str) -> Result { +pub async fn simple_query<'a>( + client: &'a mut InnerClient, + query: &str, +) -> Result, Error> { debug!("executing simple query: {}", query); - let buf = encode(client, query)?; - let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + let responses = client.send_simple_query(query)?; Ok(SimpleQueryStream { responses, columns: None, status: ReadyForQueryStatus::Unknown, - _p: PhantomPinned, }) } pub async fn batch_execute( - client: &InnerClient, + client: &mut InnerClient, query: &str, ) -> Result { debug!("executing statement batch: {}", query); - let buf = encode(client, query)?; - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + let responses = client.send_simple_query(query)?; loop { match responses.next().await? { @@ -68,25 +63,16 @@ pub async fn batch_execute( } } -pub(crate) fn encode(client: &InnerClient, query: &str) -> Result { - client.with_buf(|buf| { - frontend::query(query, buf).map_err(Error::encode)?; - Ok(buf.split().freeze()) - }) -} - pin_project! { /// A stream of simple query results. - pub struct SimpleQueryStream { - responses: Responses, + pub struct SimpleQueryStream<'a> { + responses: &'a mut Responses, columns: Option>, status: ReadyForQueryStatus, - #[pin] - _p: PhantomPinned, } } -impl SimpleQueryStream { +impl SimpleQueryStream<'_> { /// Returns if the connection is ready for querying, with the status of the connection. /// /// This might be available only after the stream has been exhausted. 
@@ -95,7 +81,7 @@ impl SimpleQueryStream { } } -impl Stream for SimpleQueryStream { +impl Stream for SimpleQueryStream<'_> { type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index e4828db712..1f22d87fd7 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -1,35 +1,15 @@ use std::fmt; -use std::sync::{Arc, Weak}; +use std::sync::Arc; +use crate::types::Type; use postgres_protocol2::Oid; use postgres_protocol2::message::backend::Field; -use postgres_protocol2::message::frontend; - -use crate::client::InnerClient; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::types::Type; struct StatementInner { - client: Weak, name: &'static str, - params: Vec, columns: Vec, } -impl Drop for StatementInner { - fn drop(&mut self) { - if let Some(client) = self.client.upgrade() { - let buf = client.with_buf(|buf| { - frontend::close(b'S', self.name, buf).unwrap(); - frontend::sync(buf); - buf.split().freeze() - }); - let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf))); - } - } -} - /// A prepared statement. /// /// Prepared statements can only be used with the connection that created them. @@ -37,38 +17,14 @@ impl Drop for StatementInner { pub struct Statement(Arc); impl Statement { - pub(crate) fn new( - inner: &Arc, - name: &'static str, - params: Vec, - columns: Vec, - ) -> Statement { - Statement(Arc::new(StatementInner { - client: Arc::downgrade(inner), - name, - params, - columns, - })) - } - - pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { - Statement(Arc::new(StatementInner { - client: Weak::new(), - name: "", - params, - columns, - })) + pub(crate) fn new(name: &'static str, columns: Vec) -> Statement { + Statement(Arc::new(StatementInner { name, columns })) } pub(crate) fn name(&self) -> &str { self.0.name } - /// Returns the expected types of the statement's parameters. - pub fn params(&self) -> &[Type] { - &self.0.params - } - /// Returns information about the columns returned when the statement is queried. pub fn columns(&self) -> &[Column] { &self.0.columns @@ -78,7 +34,7 @@ impl Statement { /// Information about a column of a query. pub struct Column { name: String, - type_: Type, + pub(crate) type_: Type, // raw fields from RowDescription table_oid: Oid, diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index f32603470f..12fe0737d4 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -1,7 +1,3 @@ -use postgres_protocol2::message::frontend; - -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; use crate::query::RowStream; use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; @@ -20,14 +16,7 @@ impl Drop for Transaction<'_> { return; } - let buf = self.client.inner().with_buf(|buf| { - frontend::query("ROLLBACK", buf).unwrap(); - buf.split().freeze() - }); - let _ = self - .client - .inner() - .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + let _ = self.client.inner_mut().send_simple_query("ROLLBACK"); } } @@ -54,7 +43,11 @@ impl<'a> Transaction<'a> { } /// Like `Client::query_raw_txt`. 
- pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + pub async fn query_raw_txt( + &mut self, + statement: &str, + params: I, + ) -> Result where S: AsRef, I: IntoIterator>, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index dfaeedaeae..1c5bb64480 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -14,7 +14,9 @@ use hyper::http::{HeaderName, HeaderValue}; use hyper::{HeaderMap, Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; -use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; +use postgres_client::{ + GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction, +}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; @@ -1092,22 +1094,41 @@ async fn query_to_json( let query_start = Instant::now(); let query_params = data.params; - let mut row_stream = std::pin::pin!( - client - .query_raw_txt(&data.query, query_params) - .await - .map_err(SqlOverHttpError::Postgres)? - ); + let mut row_stream = client + .query_raw_txt(&data.query, query_params) + .await + .map_err(SqlOverHttpError::Postgres)?; let query_acknowledged = Instant::now(); + let columns_len = row_stream.statement.columns().len(); + let mut fields = Vec::with_capacity(columns_len); + let mut types = Vec::with_capacity(columns_len); + + for c in row_stream.statement.columns() { + fields.push(json!({ + "name": c.name().to_owned(), + "dataTypeID": c.type_().oid(), + "tableID": c.table_oid(), + "columnID": c.column_id(), + "dataTypeSize": c.type_size(), + "dataTypeModifier": c.type_modifier(), + "format": "text", + })); + + types.push(c.type_().clone()); + } + + let raw_output = parsed_headers.raw_output; + let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); + // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - let mut rows: Vec = Vec::new(); + let mut rows = Vec::new(); while let Some(row) = row_stream.next().await { let row = row.map_err(SqlOverHttpError::Postgres)?; *current_size += row.body_len(); - rows.push(row); + // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) if *current_size > config.max_response_size_bytes { @@ -1115,13 +1136,26 @@ async fn query_to_json( config.max_response_size_bytes, )); } + + let row = pg_text_row_to_json(&row, &types, raw_output, array_mode)?; + rows.push(row); + + // assumption: parsing pg text and converting to json takes CPU time. + // let's assume it is slightly expensive, so we should consume some cooperative budget. + // Especially considering that `RowStream::next` might be pulling from a batch + // of rows and never hit the tokio mpsc for a long time (although unlikely). + tokio::task::consume_budget().await; } let query_resp_end = Instant::now(); - let ready = row_stream.ready_status(); + let RowStream { + command_tag, + status: ready, + .. 
+ } = row_stream; // grab the command tag and number of rows affected - let command_tag = row_stream.command_tag().unwrap_or_default(); + let command_tag = command_tag.unwrap_or_default(); let mut command_tag_split = command_tag.split(' '); let command_tag_name = command_tag_split.next().unwrap_or_default(); let command_tag_count = if command_tag_name == "INSERT" { @@ -1142,38 +1176,6 @@ async fn query_to_json( "finished executing query" ); - let columns_len = row_stream.columns().len(); - let mut fields = Vec::with_capacity(columns_len); - let mut columns = Vec::with_capacity(columns_len); - - for c in row_stream.columns() { - fields.push(json!({ - "name": c.name().to_owned(), - "dataTypeID": c.type_().oid(), - "tableID": c.table_oid(), - "columnID": c.column_id(), - "dataTypeSize": c.type_size(), - "dataTypeModifier": c.type_modifier(), - "format": "text", - })); - - match client.get_type(c.type_oid()).await { - Ok(t) => columns.push(t), - Err(err) => { - tracing::warn!(?err, "unable to query type information"); - return Err(SqlOverHttpError::InternalPostgres(err)); - } - } - } - - let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); - - // convert rows to JSON - let rows = rows - .iter() - .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) - .collect::, _>>()?; - // Resulting JSON format is based on the format of node-postgres result. let results = json!({ "command": command_tag_name.to_string(), From abc6c84262368b8770116f0f71b4a814d60bf971 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 23 May 2025 15:16:13 -0500 Subject: [PATCH 142/142] Update sql_exporter to 0.17.3 (#12013) Signed-off-by: Tristan Partin --- build-tools.Dockerfile | 2 +- compute/compute-node.Dockerfile | 6 +++--- test_runner/regress/test_compute_metrics.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 1933fd19d8..9d4c93e1cd 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -155,7 +155,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. 
-ENV SQL_EXPORTER_VERSION=0.17.0
+ENV SQL_EXPORTER_VERSION=0.17.3
 RUN curl -fsSL \
 "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
 --output sql_exporter.tar.gz \
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 3e2c09493f..f4a5593b71 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1784,17 +1784,17 @@ ARG TARGETARCH
 RUN if [ "$TARGETARCH" = "amd64" ]; then\
 postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
 pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
-sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
+sql_exporter_sha256='9a41127a493e8bfebfe692bf78c7ed2872a58a3f961ee534d1b0da9ae584aaab';\
 else\
 postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
 pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
-sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
+sql_exporter_sha256='530e6afc77c043497ed965532c4c9dfa873bc2a4f0b3047fad367715c0081d6a';\
 fi\
 && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
 | tar xzf - --strip-components=1 -C.\
 && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
 | tar xzf - --strip-components=1 -C.\
-&& curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\
+&& curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.3/sql_exporter-0.17.3.linux-${TARGETARCH}.tar.gz\
 | tar xzf - --strip-components=1 -C.\
 && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\
 && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py
index 5e3f8671a2..2cb2ee7b58 100644
--- a/test_runner/regress/test_compute_metrics.py
+++ b/test_runner/regress/test_compute_metrics.py
@@ -217,11 +217,11 @@ if SQL_EXPORTER is None:
         self, logs_dir: Path, config_file: Path, collector_file: Path, port: int
     ) -> None:
         # NOTE: Keep the version the same as in
-        # compute/Dockerfile.compute-node and Dockerfile.build-tools.
+        # compute/compute-node.Dockerfile and build-tools.Dockerfile.
         #
         # The "host" network mode allows sql_exporter to talk to the
         # endpoint which is running on the host.
-        super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host")
+        super().__init__("docker.io/burningalchemist/sql_exporter:0.17.3", network_mode="host")
         self.__logs_dir = logs_dir
         self.__port = port
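Note: the sha256 values in this patch come from the upstream release artifacts. As a rough sketch (not part of the patch itself), the standalone Python script below shows one way the per-architecture sql_exporter checksums could be recomputed when bumping SQL_EXPORTER_VERSION; the release URL layout is an assumption based on the URLs used in the Dockerfiles above.

# Sketch: recompute the sql_exporter tarball checksums referenced in
# build-tools.Dockerfile and compute-node.Dockerfile (assumed URL layout).
import hashlib
import urllib.request

SQL_EXPORTER_VERSION = "0.17.3"  # keep in sync with the Dockerfiles

for arch in ("amd64", "arm64"):
    url = (
        "https://github.com/burningalchemist/sql_exporter/releases/download/"
        f"{SQL_EXPORTER_VERSION}/sql_exporter-{SQL_EXPORTER_VERSION}.linux-{arch}.tar.gz"
    )
    # Download the release tarball and hash its bytes.
    with urllib.request.urlopen(url) as resp:
        digest = hashlib.sha256(resp.read()).hexdigest()
    print(f"{arch}: sql_exporter_sha256='{digest}'")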