diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 8ccb67b04a..c185086eef 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index b3cd5de01c..0a0f974ea4 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 22c705e1cf..4427bb344e 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index f03e2d9435..53626b4f59 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index b537795704..34c8e77280 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,11 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + min_avail_bytes: 0 + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index cd8f832af0..94f2be83a4 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,11 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + min_avail_bytes: 0 + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index ad712c4745..2307856464 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -30,10 +30,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: dev - zenith_region: eu-west-1 - zenith_region_slug: eu-west-1 + neon_service: proxy-scram + neon_env: dev + neon_region: eu-west-1 exposedService: annotations: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 91ddd07eae..feca05aff6 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -15,10 +15,9 @@ settings: # -- Additional labels for neon-proxy-link pods podLabels: - zenith_service: proxy - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy + neon_env: dev + neon_region: us-east-2 service: type: LoadBalancer diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 6ec18ff388..feee1b369a 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -15,10 +15,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram-legacy - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram-legacy + neon_env: dev + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index a091be1016..40814e55c9 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -30,10 +30,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram + neon_env: dev + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 8d65e94d00..aa5be89101 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: ap-southeast-1 - zenith_region_slug: ap-southeast-1 + neon_service: proxy-scram + neon_env: prod + neon_region: ap-southeast-1 exposedService: annotations: diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f806b37482..083af6aa2d 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: eu-central-1 - zenith_region_slug: eu-central-1 + neon_service: proxy-scram + neon_env: prod + neon_region: eu-central-1 exposedService: annotations: diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index eff24302bb..30dcefc151 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -13,10 +13,9 @@ settings: # -- Additional labels for zenith-proxy pods podLabels: - zenith_service: proxy - zenith_env: production - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy + neon_env: production + neon_region: us-east-2 service: type: LoadBalancer diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 38719f64e7..40fbc52b39 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index d23ea41bd7..a186fb833f 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-west-2 - zenith_region_slug: us-west-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-west-2 exposedService: annotations: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index d5a7d6d575..810a6a5f78 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-west-2 - zenith_region_slug: us-west-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-west-2 exposedService: annotations: diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 3f32b80ca8..816c5ee711 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -3,8 +3,12 @@ ## Issue ticket number and link ## Checklist before requesting a review + - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. +## Checklist before merging + +- [ ] Do not forget to reformat commit message to not include the above checklist diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d50a42d83c..8482341b0c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -184,10 +184,10 @@ jobs: CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked $CARGO_FEATURES" + CARGO_FLAGS="--locked" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FLAGS="--locked --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -240,11 +240,18 @@ jobs: - name: Run cargo build run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run cargo test run: | - ${cov_prefix} cargo test $CARGO_FLAGS + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_REGION=eu-central-1 + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact - name: Install rust binaries run: | @@ -268,7 +275,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do @@ -891,6 +898,16 @@ jobs: needs: [ push-docker-hub, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Checkout uses: actions/checkout@v3 with: diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 2ae517e5e7..ef4c293e31 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,14 +53,14 @@ jobs: uses: actions/cache@v3 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v3 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | diff --git a/Cargo.lock b/Cargo.lock index 17aacd8ee7..4590e76014 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2474,6 +2474,7 @@ dependencies = [ "strum", "strum_macros", "svg_fmt", + "sync_wrapper", "tempfile", "tenant_size_model", "thiserror", @@ -3085,6 +3086,7 @@ dependencies = [ "serde", "serde_json", "tempfile", + "test-context", "tokio", "tokio-util", "toml_edit", @@ -3888,6 +3890,27 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "test-context" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +dependencies = [ + "async-trait", + "futures", + "test-context-macros", +] + +[[package]] +name = "test-context-macros" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "textwrap" version = "0.16.0" @@ -4534,6 +4557,7 @@ dependencies = [ "once_cell", "pin-project-lite", "rand", + "regex", "routerify", "sentry", "serde", diff --git a/Cargo.toml b/Cargo.toml index e27a50a1cb..09cc150606 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ strum_macros = "0.24" svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" +test-context = "0.1" thiserror = "1.0" tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } diff --git a/README.md b/README.md index 43f3e3a02b..55df67f6c7 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ pacman -S base-devel readline zlib libseccomp openssl clang \ postgresql-libs cmake postgresql protobuf ``` +Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). + 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b96842e416..f29a576413 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -203,13 +203,14 @@ fn main() -> Result<()> { if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); - info!("shutting down"); } + info!("shutting down tracing"); // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. tracing_utils::shutdown_tracing(); + info!("shutting down"); exit(exit_code.unwrap_or(1)) } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 79f851ed13..01b192b2de 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -74,18 +74,9 @@ impl GenericOption { /// Represent `GenericOption` as configuration option. pub fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { - // TODO: check in the console DB that we don't have these settings - // set for any non-deleted project and drop this override. - let name = match self.name.as_str() { - "safekeepers" => "neon.safekeepers", - "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", - "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout", - it => it, - }; - match self.vartype.as_ref() { - "string" => format!("{} = '{}'", name, escape_conf_value(val)), - _ => format!("{} = {}", name, val), + "string" => format!("{} = '{}'", self.name, escape_conf_value(val)), + _ => format!("{} = {}", self.name, val), } } else { self.name.to_owned() diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index ee504bfaa6..bc81107706 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -90,7 +90,6 @@ impl ComputeControlPlane { timeline_id, lsn, tenant_id, - uses_wal_proposer: false, pg_version, }); @@ -115,7 +114,6 @@ pub struct PostgresNode { pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. None for primary pub tenant_id: TenantId, - uses_wal_proposer: bool, pg_version: u32, } @@ -149,7 +147,6 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; - let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // Read postgres version from PG_VERSION file to determine which postgres version binary to use. // If it doesn't exist, assume broken data directory and use default pg version. @@ -172,7 +169,6 @@ impl PostgresNode { timeline_id, lsn: recovery_target_lsn, tenant_id, - uses_wal_proposer, pg_version, }) } @@ -364,7 +360,7 @@ impl PostgresNode { fn load_basebackup(&self, auth_token: &Option) -> Result<()> { let backup_lsn = if let Some(lsn) = self.lsn { Some(lsn) - } else if self.uses_wal_proposer { + } else if !self.env.safekeepers.is_empty() { // LSN 0 means that it is bootstrap and we need to download just // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3c66400a05..094069e4c0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -363,6 +363,11 @@ impl PageServerNode { .map(|x| serde_json::from_str(x)) .transpose() .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .remove("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -435,6 +440,11 @@ impl PageServerNode { .map(|x| serde_json::from_str(x)) .transpose() .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .get("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as an integer")?, }) .send()? .error_from_body()?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0f860d0a6d..98a4b56858 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -120,6 +120,7 @@ pub struct TenantCreateRequest { // We might do that once the eviction feature has stabilizied. // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, + pub min_resident_size_override: Option, } #[serde_as] @@ -165,6 +166,7 @@ pub struct TenantConfigRequest { // We might do that once the eviction feature has stabilizied. // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, + pub min_resident_size_override: Option, } impl TenantConfigRequest { @@ -185,6 +187,7 @@ impl TenantConfigRequest { max_lsn_wal_lag: None, trace_read_requests: None, eviction_policy: None, + min_resident_size_override: None, } } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 656c0ff312..a976e19029 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -936,35 +936,40 @@ impl<'a> BeMessage<'a> { } } -// Neon extension of postgres replication protocol -// See NEON_STATUS_UPDATE_TAG_BYTE +/// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// Serialized in custom flexible key/value format. In replication protocol, it +/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres +/// Standby status update / Hot standby feedback messages. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub struct ReplicationFeedback { - // Last known size of the timeline. Used to enforce timeline size limit. +pub struct PageserverFeedback { + /// Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, - // Parts of StandbyStatusUpdate we resend to compute via safekeeper - pub ps_writelsn: u64, - pub ps_applylsn: u64, - pub ps_flushlsn: u64, - pub ps_replytime: SystemTime, + /// LSN last received and ingested by the pageserver. + pub last_received_lsn: u64, + /// LSN up to which data is persisted by the pageserver to its local disc. + pub disk_consistent_lsn: u64, + /// LSN up to which data is persisted by the pageserver on s3; safekeepers + /// consider WAL before it can be removed. + pub remote_consistent_lsn: u64, + pub replytime: SystemTime, } -// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback. +// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. // Do not remove previously available fields because this might be backwards incompatible. -pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5; +pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; -impl ReplicationFeedback { - pub fn empty() -> ReplicationFeedback { - ReplicationFeedback { +impl PageserverFeedback { + pub fn empty() -> PageserverFeedback { + PageserverFeedback { current_timeline_size: 0, - ps_writelsn: 0, - ps_applylsn: 0, - ps_flushlsn: 0, - ps_replytime: SystemTime::now(), + last_received_lsn: 0, + remote_consistent_lsn: 0, + disk_consistent_lsn: 0, + replytime: SystemTime::now(), } } - // Serialize ReplicationFeedback using custom format + // Serialize PageserverFeedback using custom format // to support protocol extensibility. // // Following layout is used: @@ -974,24 +979,26 @@ impl ReplicationFeedback { // null-terminated string - key, // uint32 - value length in bytes // value itself + // + // TODO: change serialized fields names once all computes migrate to rename. pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_writelsn); + buf.put_u64(self.last_received_lsn); buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_flushlsn); + buf.put_u64(self.disk_consistent_lsn); buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_applylsn); + buf.put_u64(self.remote_consistent_lsn); let timestamp = self - .ps_replytime + .replytime .duration_since(*PG_EPOCH) .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; @@ -1001,9 +1008,10 @@ impl ReplicationFeedback { buf.put_i64(timestamp); } - // Deserialize ReplicationFeedback message - pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut rf = ReplicationFeedback::empty(); + // Deserialize PageserverFeedback message + // TODO: change serialized fields names once all computes migrate to rename. + pub fn parse(mut buf: Bytes) -> PageserverFeedback { + let mut rf = PageserverFeedback::empty(); let nfields = buf.get_u8(); for _ in 0..nfields { let key = read_cstr(&mut buf).unwrap(); @@ -1016,39 +1024,39 @@ impl ReplicationFeedback { b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_writelsn = buf.get_u64(); + rf.last_received_lsn = buf.get_u64(); } b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_flushlsn = buf.get_u64(); + rf.disk_consistent_lsn = buf.get_u64(); } b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_applylsn = buf.get_u64(); + rf.remote_consistent_lsn = buf.get_u64(); } b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { let len = buf.get_i32(); warn!( - "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.", + "PageserverFeedback parse. unknown key {} of len {len}. Skip it.", String::from_utf8_lossy(key.as_ref()) ); buf.advance(len as usize); } } } - trace!("ReplicationFeedback parsed is {:?}", rf); + trace!("PageserverFeedback parsed is {:?}", rf); rf } } @@ -1059,33 +1067,33 @@ mod tests { #[test] fn test_replication_feedback_serialization() { - let mut rf = ReplicationFeedback::empty(); + let mut rf = PageserverFeedback::empty(); // Fill rf with some values rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); rf.serialize(&mut data); - let rf_parsed = ReplicationFeedback::parse(data.freeze()); + let rf_parsed = PageserverFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); } #[test] fn test_replication_feedback_unknown_key() { - let mut rf = ReplicationFeedback::empty(); + let mut rf = PageserverFeedback::empty(); // Fill rf with some values rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { - *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; + *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; } data.put_slice(b"new_field_one\0"); @@ -1093,7 +1101,7 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let rf_parsed = ReplicationFeedback::parse(data.freeze()); + let rf_parsed = PageserverFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 15812e8439..da15823b69 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -26,3 +26,4 @@ workspace_hack.workspace = true [dev-dependencies] tempfile.workspace = true +test-context.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 901f849801..1d50a777f4 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -39,6 +39,9 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +/// No limits on the client side, which currenltly means 1000 for AWS S3. +/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax +pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; @@ -64,6 +67,10 @@ impl RemotePath { pub fn object_name(&self) -> Option<&str> { self.0.file_name().and_then(|os_str| os_str.to_str()) } + + pub fn join(&self, segment: &Path) -> Self { + Self(self.0.join(segment)) + } } /// Storage (potentially remote) API to manage its state. @@ -266,6 +273,7 @@ pub struct S3Config { /// AWS S3 has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, + pub max_keys_per_list_response: Option, } impl Debug for S3Config { @@ -275,6 +283,10 @@ impl Debug for S3Config { .field("bucket_region", &self.bucket_region) .field("prefix_in_bucket", &self.prefix_in_bucket) .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) .finish() } } @@ -303,6 +315,11 @@ impl RemoteStorageConfig { ) .context("Failed to parse 'concurrency_limit' as a positive integer")?; + let max_keys_per_list_response = + parse_optional_integer::("max_keys_per_list_response", toml) + .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? + .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); + let storage = match (local_path, bucket_name, bucket_region) { // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled (None, None, None) => return Ok(None), @@ -324,6 +341,7 @@ impl RemoteStorageConfig { .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?, concurrency_limit, + max_keys_per_list_response, }), (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( parse_toml_string("local_path", local_path)?, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 93f5e0596e..d4eb7d9244 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -102,6 +102,7 @@ pub struct S3Bucket { client: Client, bucket_name: String, prefix_in_bucket: Option, + max_keys_per_list_response: Option, // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. @@ -164,6 +165,7 @@ impl S3Bucket { Ok(Self { client, bucket_name: aws_config.bucket_name.clone(), + max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) @@ -291,7 +293,9 @@ impl RemoteStorage for S3Bucket { .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(self.prefix_in_bucket.clone()) + .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) .set_continuation_token(continuation_token) + .set_max_keys(self.max_keys_per_list_response) .send() .await .map_err(|e| { @@ -306,7 +310,7 @@ impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))), ); - match fetch_response.continuation_token { + match fetch_response.next_continuation_token { Some(new_token) => continuation_token = Some(new_token), None => break, } @@ -354,6 +358,7 @@ impl RemoteStorage for S3Bucket { .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) + .set_max_keys(self.max_keys_per_list_response) .send() .await .map_err(|e| { @@ -371,7 +376,7 @@ impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); - match fetch_response.continuation_token { + match fetch_response.next_continuation_token { Some(new_token) => continuation_token = Some(new_token), None => break, } diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs new file mode 100644 index 0000000000..eb52409c44 --- /dev/null +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -0,0 +1,275 @@ +use std::collections::HashSet; +use std::env; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::ops::ControlFlow; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::UNIX_EPOCH; + +use anyhow::Context; +use remote_storage::{ + GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, +}; +use test_context::{test_context, AsyncTestContext}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; + +/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. +/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. +/// See the client creation in [`create_s3_client`] for details on the required env vars. +/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the +/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. +/// +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// where +/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference +/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket +/// +/// Then, verifies that the client does return correct prefixes when queried: +/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only +/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` +/// +/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. +/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// since current default AWS S3 pagination limit is 1000. +/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// +/// Lastly, the test attempts to clean up and remove all uploaded S3 files. +/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3::Enabled(ctx) => ctx, + MaybeEnabledS3::Disabled => return Ok(()), + MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"), + }; + + let test_client = Arc::clone(&ctx.client_with_excessive_pagination); + let expected_remote_prefixes = ctx.remote_prefixes.clone(); + + let base_prefix = + RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?; + let root_remote_prefixes = test_client + .list_prefixes(None) + .await + .context("client list root prefixes failure")? + .into_iter() + .collect::>(); + assert_eq!( + root_remote_prefixes, HashSet::from([base_prefix.clone()]), + "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" + ); + + let nested_remote_prefixes = test_client + .list_prefixes(Some(&base_prefix)) + .await + .context("client list nested prefixes failure")? + .into_iter() + .collect::>(); + let remote_only_prefixes = nested_remote_prefixes + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + + Ok(()) +} + +enum MaybeEnabledS3 { + Enabled(S3WithTestBlobs), + Disabled, + UploadsFailed(anyhow::Error, S3WithTestBlobs), +} + +struct S3WithTestBlobs { + client_with_excessive_pagination: Arc, + base_prefix_str: &'static str, + remote_prefixes: HashSet, + remote_blobs: HashSet, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledS3 { + async fn setup() -> Self { + utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed"); + if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + let max_keys_in_list_response = 10; + let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); + + let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response) + .context("S3 client creation") + .expect("S3 client creation failed"); + + let base_prefix_str = "test/"; + match upload_s3_data( + &client_with_excessive_pagination, + base_prefix_str, + upload_tasks_count, + ) + .await + { + ControlFlow::Continue(uploads) => { + info!("Remote objects created successfully"); + Self::Enabled(S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }) + } + ControlFlow::Break(uploads) => Self::UploadsFailed( + anyhow::anyhow!("One or multiple blobs failed to upload to S3"), + S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }, + ), + } + } + + async fn teardown(self) { + match self { + Self::Disabled => {} + Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { + cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await; + } + } + } +} + +fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result> { + let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET") + .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?; + let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION") + .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?; + let random_prefix_part = std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .context("random s3 test prefix part calculation")? + .as_millis(); + let remote_storage_config = RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), + max_sync_errors: NonZeroU32::new(5).unwrap(), + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: remote_storage_s3_bucket, + bucket_region: remote_storage_s3_region, + prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: Some(max_keys_per_list_response), + }), + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + )) +} + +struct Uploads { + prefixes: HashSet, + blobs: HashSet, +} + +async fn upload_s3_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} S3 files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/")); + let blob_prefix = RemotePath::new(&prefix) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let data = format!("remote blob data {i}").into_bytes(); + let data_len = data.len(); + task_client + .upload( + Box::new(std::io::Cursor::new(data)), + data_len, + &blob_path, + None, + ) + .await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +async fn cleanup(client: &Arc, objects_to_delete: HashSet) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index b9f67e82f8..391bc52a80 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,6 +19,7 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true +regex.workspace = true routerify.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 766d759ab4..d4176911ac 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -51,6 +51,9 @@ pub mod history_buffer; pub mod measured_stream; +pub mod serde_percent; +pub mod serde_regex; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs new file mode 100644 index 0000000000..63b62b5f1e --- /dev/null +++ b/libs/utils/src/serde_percent.rs @@ -0,0 +1,83 @@ +//! A serde::Deserialize type for percentages. +//! +//! See [`Percent`] for details. + +use serde::{Deserialize, Serialize}; + +/// If the value is not an integer between 0 and 100, +/// deserialization fails with a descriptive error. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); + +impl Percent { + pub fn get(&self) -> u8 { + self.0 + } +} + +fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let v: u8 = serde::de::Deserialize::deserialize(deserializer)?; + if v > 100 { + return Err(serde::de::Error::custom( + "must be an integer between 0 and 100", + )); + } + Ok(v) +} + +#[cfg(test)] +mod tests { + use super::Percent; + + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)] + struct Foo { + bar: Percent, + } + + #[test] + fn basics() { + let input = r#"{ "bar": 50 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 50); + } + #[test] + fn null_handling() { + let input = r#"{ "bar": null }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn zero() { + let input = r#"{ "bar": 0 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 0); + } + #[test] + fn out_of_range_above() { + let input = r#"{ "bar": 101 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn out_of_range_below() { + let input = r#"{ "bar": -1 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn float() { + let input = r#"{ "bar": 50.5 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn string() { + let input = r#"{ "bar": "50 %" }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } +} diff --git a/libs/utils/src/serde_regex.rs b/libs/utils/src/serde_regex.rs new file mode 100644 index 0000000000..95ea4f8e44 --- /dev/null +++ b/libs/utils/src/serde_regex.rs @@ -0,0 +1,60 @@ +//! A `serde::{Deserialize,Serialize}` type for regexes. + +use std::ops::Deref; + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct Regex( + #[serde( + deserialize_with = "deserialize_regex", + serialize_with = "serialize_regex" + )] + regex::Regex, +); + +fn deserialize_regex<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?; + Ok(re) +} + +fn serialize_regex(re: ®ex::Regex, serializer: S) -> Result +where + S: serde::ser::Serializer, +{ + serializer.collect_str(re.as_str()) +} + +impl Deref for Regex { + type Target = regex::Regex; + + fn deref(&self) -> ®ex::Regex { + &self.0 + } +} + +impl PartialEq for Regex { + fn eq(&self, other: &Regex) -> bool { + // comparing the automatons would be quite complicated + self.as_str() == other.as_str() + } +} + +impl Eq for Regex {} + +#[cfg(test)] +mod tests { + + #[test] + fn roundtrip() { + let input = r#""foo.*bar""#; + let re: super::Regex = serde_json::from_str(input).unwrap(); + assert!(re.is_match("foo123bar")); + assert!(!re.is_match("foo")); + let output = serde_json::to_string(&re).unwrap(); + assert_eq!(output, input); + } +} diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index 6586da2339..c37e9aea58 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,25 +1,7 @@ -use signal_hook::flag; use signal_hook::iterator::Signals; -use std::sync::atomic::AtomicBool; -use std::sync::Arc; pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; -pub fn install_shutdown_handlers() -> anyhow::Result { - let term_now = Arc::new(AtomicBool::new(false)); - for sig in TERM_SIGNALS { - // When terminated by a second term signal, exit with exit code 1. - // This will do nothing the first time (because term_now is false). - flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?; - // But this will "arm" the above for the second time, by setting it to true. - // The order of registering these is important, if you put this one first, it will - // first arm and then terminate ‒ all in the first round. - flag::register(*sig, Arc::clone(&term_now))?; - } - - Ok(ShutdownSignals) -} - pub enum Signal { Quit, Interrupt, @@ -39,10 +21,7 @@ impl Signal { pub struct ShutdownSignals; impl ShutdownSignals { - pub fn handle( - self, - mut handler: impl FnMut(Signal) -> anyhow::Result<()>, - ) -> anyhow::Result<()> { + pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> { for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() { let signal = match raw_signal { SIGINT => Signal::Interrupt, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8d6641a387..0bc7eba95e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,6 +48,7 @@ serde_json = { workspace = true, features = ["raw_value"] } serde_with.workspace = true signal-hook.workspace = true svg_fmt.workspace = true +sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 14e86ddcb6..ed23a18ee0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -24,11 +25,9 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::signals::ShutdownSignals; use utils::{ - auth::JwtAuth, - logging, project_git_version, - sentry_init::init_sentry, - signals::{self, Signal}, + auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, tcp_listener, }; @@ -263,9 +262,6 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; - // Install signal handlers - let signals = signals::install_shutdown_handlers()?; - // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; @@ -319,14 +315,34 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + // shared state between the disk-usage backed eviction background task and the http endpoint + // that allows triggering disk-usage based eviction manually. note that the http endpoint + // is still accessible even if background task is not configured as long as remote storage has + // been configured. + let disk_usage_eviction_state: Arc = Arc::default(); + + if let Some(remote_storage) = &remote_storage { + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + )?; + } + // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, launch_ts, http_auth, remote_storage)? - .build() - .map_err(|err| anyhow!(err))?; + let router = http::make_router( + conf, + launch_ts, + http_auth, + remote_storage, + disk_usage_eviction_state, + )? + .build() + .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) @@ -409,7 +425,7 @@ fn start_pageserver( } // All started up! Now just sit and wait for shutdown signal. - signals.handle(|signal| match signal { + ShutdownSignals::handle(|signal| match signal { Signal::Quit => { info!( "Got {}. Terminating in immediate shutdown mode", diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 58a6056385..19f0f22815 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -27,6 +27,7 @@ use utils::{ logging::LogFormat, }; +use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; @@ -92,6 +93,8 @@ pub mod defaults { #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' +#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -104,6 +107,8 @@ pub mod defaults { #image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} #pitr_interval = '{DEFAULT_PITR_INTERVAL}' +#min_resident_size_override = .. # in bytes + # [remote_storage] "### @@ -180,6 +185,8 @@ pub struct PageServerConf { // See the corresponding metric's help string. pub evictions_low_residence_duration_metric_threshold: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -252,6 +259,8 @@ struct PageServerConfigBuilder { evictions_low_residence_duration_metric_threshold: BuilderValue, + disk_usage_based_eviction: BuilderValue>, + test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, @@ -312,6 +321,8 @@ impl Default for PageServerConfigBuilder { ) .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")), + disk_usage_based_eviction: Set(None), + test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), @@ -431,6 +442,10 @@ impl PageServerConfigBuilder { self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value); } + pub fn disk_usage_based_eviction(&mut self, value: Option) { + self.disk_usage_based_eviction = BuilderValue::Set(value); + } + pub fn ondemand_download_behavior_treat_error_as_warn( &mut self, ondemand_download_behavior_treat_error_as_warn: bool, @@ -515,6 +530,9 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing evictions_low_residence_duration_metric_threshold" ))?, + disk_usage_based_eviction: self + .disk_usage_based_eviction + .ok_or(anyhow!("missing disk_usage_based_eviction"))?, test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, @@ -704,6 +722,12 @@ impl PageServerConf { builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?), + "disk_usage_based_eviction" => { + tracing::info!("disk_usage_based_eviction: {:#?}", &item); + builder.disk_usage_based_eviction( + toml_edit::de::from_item(item.clone()) + .context("parse disk_usage_based_eviction")?) + }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } @@ -808,6 +832,13 @@ impl PageServerConf { ); } + if let Some(item) = item.get("min_resident_size_override") { + t_conf.min_resident_size_override = Some( + toml_edit::de::from_item(item.clone()) + .context("parse min_resident_size_override")?, + ); + } + Ok(t_conf) } @@ -850,6 +881,7 @@ impl PageServerConf { defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .unwrap(), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, } @@ -1058,6 +1090,7 @@ log_format = 'json' evictions_low_residence_duration_metric_threshold: humantime::parse_duration( defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD )?, + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -1112,6 +1145,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), evictions_low_residence_duration_metric_threshold: Duration::from_secs(444), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -1238,6 +1272,7 @@ broker_endpoint = '{broker_endpoint}' prefix_in_bucket: Some(prefix_in_bucket.clone()), endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, + max_keys_per_list_response: None, }), }, "Remote storage config should correctly parse the S3 config" diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs new file mode 100644 index 0000000000..eeeb6fda89 --- /dev/null +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -0,0 +1,689 @@ +//! This module implements the pageserver-global disk-usage-based layer eviction task. +//! +//! # Mechanics +//! +//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background +//! loop that evicts layers in response to a shortage of available bytes +//! in the $repo/tenants directory's filesystem. +//! +//! The loop runs periodically at a configurable `period`. +//! +//! Each loop iteration uses `statvfs` to determine filesystem-level space usage. +//! It compares the returned usage data against two different types of thresholds. +//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds. +//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration. +//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds. +//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. +//! +//! # Eviction Policy +//! +//! There are two thresholds: +//! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. +//! If the actual usage is higher, the threshold is exceeded. +//! `min_avail_bytes` is the absolute available space in bytes. +//! If the actual usage is lower, the threshold is exceeded. +//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction +//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again. +//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant. +//! The reservation is to keep the most recently accessed X bytes per tenant resident. +//! If we cannot relieve pressure by evicting layers outside of the reservation, we +//! start evicting layers that are part of the reservation, LRU first. +//! +//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size` +//! throughout the code, but, no actual variable carries that name. +//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`. +//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress +//! during page reconstruction. +//! An alternative default for all tenants can be specified in the `tenant_config` section of the config. +//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`). + +// Implementation notes: +// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl +// reading these fields. We use the Debug impl for semi-structured logging, though. + +use std::{ + collections::HashMap, + path::Path, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::Context; +use remote_storage::GenericRemoteStorage; +use serde::{Deserialize, Serialize}; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, instrument, warn, Instrument}; +use utils::serde_percent::Percent; + +use crate::{ + config::PageServerConf, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + tenant::{self, storage_layer::PersistentLayer, Timeline}, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, +} + +#[derive(Default)] +pub struct State { + /// Exclude http requests and background task from running at the same time. + mutex: tokio::sync::Mutex<()>, +} + +pub fn launch_disk_usage_global_eviction_task( + conf: &'static PageServerConf, + storage: GenericRemoteStorage, + state: Arc, +) -> anyhow::Result<()> { + let Some(task_config) = &conf.disk_usage_based_eviction else { + info!("disk usage based eviction task not configured"); + return Ok(()); + }; + + info!("launching disk usage based eviction task"); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "disk usage based eviction", + false, + async move { + disk_usage_eviction_task( + &state, + task_config, + storage, + &conf.tenants_path(), + task_mgr::shutdown_token(), + ) + .await; + info!("disk usage based eviction task finishing"); + Ok(()) + }, + ); + + Ok(()) +} + +#[instrument(skip_all)] +async fn disk_usage_eviction_task( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: GenericRemoteStorage, + tenants_dir: &Path, + cancel: CancellationToken, +) { + use crate::tenant::tasks::random_init_delay; + { + if random_init_delay(task_config.period, &cancel) + .await + .is_err() + { + info!("shutting down"); + return; + } + } + + let mut iteration_no = 0; + loop { + iteration_no += 1; + let start = Instant::now(); + + async { + let res = disk_usage_eviction_task_iteration( + state, + task_config, + &storage, + tenants_dir, + &cancel, + ) + .await; + + match res { + Ok(()) => {} + Err(e) => { + // these stat failures are expected to be very rare + warn!("iteration failed, unexpected error: {e:#}"); + } + } + } + .instrument(tracing::info_span!("iteration", iteration_no)) + .await; + + let sleep_until = start + task_config.period; + tokio::select! { + _ = tokio::time::sleep_until(sleep_until) => {}, + _ = cancel.cancelled() => { + info!("shutting down"); + break + } + } + } +} + +pub trait Usage: Clone + Copy + std::fmt::Debug { + fn has_pressure(&self) -> bool; + fn add_available_bytes(&mut self, bytes: u64); +} + +async fn disk_usage_eviction_task_iteration( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: &GenericRemoteStorage, + tenants_dir: &Path, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) + .context("get filesystem-level disk usage before evictions")?; + let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + match res { + Ok(outcome) => { + debug!(?outcome, "disk_usage_eviction_iteration finished"); + match outcome { + IterationOutcome::NoPressure | IterationOutcome::Cancelled => { + // nothing to do, select statement below will handle things + } + IterationOutcome::Finished(outcome) => { + // Verify with statvfs whether we made any real progress + let after = filesystem_level_usage::get(tenants_dir, task_config) + // It's quite unlikely to hit the error here. Keep the code simple and bail out. + .context("get filesystem-level disk usage after evictions")?; + + debug!(?after, "disk usage"); + + if after.has_pressure() { + // Don't bother doing an out-of-order iteration here now. + // In practice, the task period is set to a value in the tens-of-seconds range, + // which will cause another iteration to happen soon enough. + // TODO: deltas between the three different usages would be helpful, + // consider MiB, GiB, TiB + warn!(?outcome, ?after, "disk usage still high"); + } else { + info!(?outcome, ?after, "disk usage pressure relieved"); + } + } + } + } + Err(e) => { + error!("disk_usage_eviction_iteration failed: {:#}", e); + } + } + + Ok(()) +} + +#[derive(Debug, Serialize)] +#[allow(clippy::large_enum_variant)] +pub enum IterationOutcome { + NoPressure, + Cancelled, + Finished(IterationOutcomeFinished), +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +pub struct IterationOutcomeFinished { + /// The actual usage observed before we started the iteration. + before: U, + /// The expected value for `after`, according to internal accounting, after phase 1. + planned: PlannedUsage, + /// The outcome of phase 2, where we actually do the evictions. + /// + /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will + /// be the same as `planned`. + assumed: AssumedUsage, +} + +#[derive(Debug, Serialize)] +#[allow(dead_code)] +struct AssumedUsage { + /// The expected value for `after`, after phase 2. + projected_after: U, + /// The layers we failed to evict during phase 2. + failed: LayerCount, +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +struct PlannedUsage { + respecting_tenant_min_resident_size: U, + fallback_to_global_lru: Option, +} + +#[allow(dead_code)] +#[derive(Debug, Default, Serialize)] +struct LayerCount { + file_sizes: u64, + count: usize, +} + +pub async fn disk_usage_eviction_task_iteration_impl( + state: &State, + storage: &GenericRemoteStorage, + usage_pre: U, + cancel: &CancellationToken, +) -> anyhow::Result> { + // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) + let _g = state + .mutex + .try_lock() + .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; + + debug!(?usage_pre, "disk usage"); + + if !usage_pre.has_pressure() { + return Ok(IterationOutcome::NoPressure); + } + + warn!( + ?usage_pre, + "running disk usage based eviction due to pressure" + ); + + let candidates = match collect_eviction_candidates(cancel).await? { + EvictionCandidates::Cancelled => { + return Ok(IterationOutcome::Cancelled); + } + EvictionCandidates::Finished(partitioned) => partitioned, + }; + + // Debug-log the list of candidates + let now = SystemTime::now(); + for (i, (partition, candidate)) in candidates.iter().enumerate() { + debug!( + "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}", + i + 1, + candidates.len(), + candidate.layer.file_size(), + now.duration_since(candidate.last_activity_ts) + .unwrap() + .as_micros(), + partition, + candidate.layer.get_tenant_id(), + candidate.layer.get_timeline_id(), + candidate.layer.filename().file_name(), + ); + } + + // phase1: select victims to relieve pressure + // + // Walk through the list of candidates, until we have accumulated enough layers to get + // us back under the pressure threshold. 'usage_planned' is updated so that it tracks + // how much disk space would be used after evicting all the layers up to the current + // point in the list. The layers are collected in 'batched', grouped per timeline. + // + // If we get far enough in the list that we start to evict layers that are below + // the tenant's min-resident-size threshold, print a warning, and memorize the disk + // usage at that point, in 'usage_planned_min_resident_size_respecting'. + let mut batched: HashMap<_, Vec>> = HashMap::new(); + let mut warned = None; + let mut usage_planned = usage_pre; + for (i, (partition, candidate)) in candidates.into_iter().enumerate() { + if !usage_planned.has_pressure() { + debug!( + no_candidates_evicted = i, + "took enough candidates for pressure to be relieved" + ); + break; + } + + if partition == MinResidentSizePartition::Below && warned.is_none() { + warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); + warned = Some(usage_planned); + } + + usage_planned.add_available_bytes(candidate.layer.file_size()); + + batched + .entry(TimelineKey(candidate.timeline)) + .or_default() + .push(candidate.layer); + } + + let usage_planned = match warned { + Some(respecting_tenant_min_resident_size) => PlannedUsage { + respecting_tenant_min_resident_size, + fallback_to_global_lru: Some(usage_planned), + }, + None => PlannedUsage { + respecting_tenant_min_resident_size: usage_planned, + fallback_to_global_lru: None, + }, + }; + debug!(?usage_planned, "usage planned"); + + // phase2: evict victims batched by timeline + + // After the loop, `usage_assumed` is the post-eviction usage, + // according to internal accounting. + let mut usage_assumed = usage_pre; + let mut evictions_failed = LayerCount::default(); + for (timeline, batch) in batched { + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let batch_size = batch.len(); + + debug!(%timeline_id, "evicting batch for timeline"); + + async { + let results = timeline.evict_layers(storage, &batch, cancel.clone()).await; + + match results { + Err(e) => { + warn!("failed to evict batch: {:#}", e); + } + Ok(results) => { + assert_eq!(results.len(), batch.len()); + for (result, layer) in results.into_iter().zip(batch.iter()) { + match result { + Some(Ok(true)) => { + usage_assumed.add_available_bytes(layer.file_size()); + } + Some(Ok(false)) => { + // this is: + // - Replacement::{NotFound, Unexpected} + // - it cannot be is_remote_layer, filtered already + evictions_failed.file_sizes += layer.file_size(); + evictions_failed.count += 1; + } + None => { + assert!(cancel.is_cancelled()); + return; + } + Some(Err(e)) => { + // we really shouldn't be getting this, precondition failure + error!("failed to evict layer: {:#}", e); + } + } + } + } + } + } + .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)) + .await; + + if cancel.is_cancelled() { + return Ok(IterationOutcome::Cancelled); + } + } + + Ok(IterationOutcome::Finished(IterationOutcomeFinished { + before: usage_pre, + planned: usage_planned, + assumed: AssumedUsage { + projected_after: usage_assumed, + failed: evictions_failed, + }, + })) +} + +#[derive(Clone)] +struct EvictionCandidate { + timeline: Arc, + layer: Arc, + last_activity_ts: SystemTime, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +enum MinResidentSizePartition { + Above, + Below, +} + +enum EvictionCandidates { + Cancelled, + Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), +} + +/// Gather the eviction candidates. +/// +/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction +/// order. A caller that evicts in that order, until pressure is relieved, implements +/// the eviction policy outlined in the module comment. +/// +/// # Example +/// +/// Imagine that there are two tenants, A and B, with five layers each, a-e. +/// Each layer has size 100, and both tenant's min_resident_size is 150. +/// The eviction order would be +/// +/// ```text +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e +/// ``` +/// +/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. +/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size. +/// +/// But, if we need to evict 900 bytes to relieve pressure, we'd evict +/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition +/// after exhauting the `Above` partition. +/// So, we did not respect each tenant's min_resident_size. +async fn collect_eviction_candidates( + cancel: &CancellationToken, +) -> anyhow::Result { + // get a snapshot of the list of tenants + let tenants = tenant::mgr::list_tenants() + .await + .context("get list of tenants")?; + + let mut candidates = Vec::new(); + + for (tenant_id, _state) in &tenants { + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await { + Ok(tenant) => tenant, + Err(e) => { + // this can happen if tenant has lifecycle transition after we fetched it + debug!("failed to get tenant: {e:#}"); + continue; + } + }; + + // collect layers from all timelines in this tenant + // + // If one of the timelines becomes `!is_active()` during the iteration, + // for example because we're shutting down, then `max_layer_size` can be too small. + // That's OK. This code only runs under a disk pressure situation, and being + // a little unfair to tenants during shutdown in such a situation is tolerable. + let mut tenant_candidates = Vec::new(); + let mut max_layer_size = 0; + for tl in tenant.list_timelines() { + if !tl.is_active() { + continue; + } + let info = tl.get_local_layers_for_disk_usage_eviction(); + debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend( + info.resident_layers + .into_iter() + .map(|layer_infos| (tl.clone(), layer_infos)), + ); + max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); + + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + } + + // `min_resident_size` defaults to maximum layer file size of the tenant. + // This ensures that each tenant can have at least one layer resident at a given time, + // ensuring forward progress for a single Timeline::get in that tenant. + // It's a questionable heuristic since, usually, there are many Timeline::get + // requests going on for a tenant, and, at least in Neon prod, the median + // layer file size is much smaller than the compaction target size. + // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. + // That's what's typically used by the various background loops. + // + // The default can be overriden with a fixed value in the tenant conf. + // A default override can be put in the default tenant conf in the pageserver.toml. + let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { + debug!( + tenant_id=%tenant.tenant_id(), + overriden_size=s, + "using overridden min resident size for tenant" + ); + s + } else { + debug!( + tenant_id=%tenant.tenant_id(), + max_layer_size, + "using max layer size as min_resident_size for tenant", + ); + max_layer_size + }; + + // Sort layers most-recently-used first, then partition by + // cumsum above/below min_resident_size. + tenant_candidates + .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); + let mut cumsum: i128 = 0; + for (timeline, layer_info) in tenant_candidates.into_iter() { + let file_size = layer_info.file_size(); + let candidate = EvictionCandidate { + timeline, + last_activity_ts: layer_info.last_activity_ts, + layer: layer_info.layer, + }; + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + candidates.push((partition, candidate)); + cumsum += i128::from(file_size); + } + } + + debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + candidates + .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + Ok(EvictionCandidates::Finished(candidates)) +} + +struct TimelineKey(Arc); + +impl PartialEq for TimelineKey { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.0, &other.0) + } +} + +impl Eq for TimelineKey {} + +impl std::hash::Hash for TimelineKey { + fn hash(&self, state: &mut H) { + Arc::as_ptr(&self.0).hash(state); + } +} + +impl std::ops::Deref for TimelineKey { + type Target = Timeline; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +mod filesystem_level_usage { + use std::path::Path; + + use anyhow::Context; + + use crate::statvfs::Statvfs; + + use super::DiskUsageEvictionTaskConfig; + + #[derive(Debug, Clone, Copy)] + #[allow(dead_code)] + pub struct Usage<'a> { + config: &'a DiskUsageEvictionTaskConfig, + + /// Filesystem capacity + total_bytes: u64, + /// Free filesystem space + avail_bytes: u64, + } + + impl super::Usage for Usage<'_> { + fn has_pressure(&self) -> bool { + let usage_pct = + (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64; + + let pressures = [ + ( + "min_avail_bytes", + self.avail_bytes < self.config.min_avail_bytes, + ), + ( + "max_usage_pct", + usage_pct > self.config.max_usage_pct.get() as u64, + ), + ]; + + pressures.into_iter().any(|(_, has_pressure)| has_pressure) + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.avail_bytes += bytes; + } + } + + pub fn get<'a>( + tenants_dir: &Path, + config: &'a DiskUsageEvictionTaskConfig, + ) -> anyhow::Result> { + let mock_config = { + #[cfg(feature = "testing")] + { + config.mock_statvfs.as_ref() + } + #[cfg(not(feature = "testing"))] + { + None + } + }; + + let stat = Statvfs::get(tenants_dir, mock_config) + .context("statvfs failed, presumably directory got unlinked")?; + + // https://unix.stackexchange.com/a/703650 + let blocksize = if stat.fragment_size() > 0 { + stat.fragment_size() + } else { + stat.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = stat.blocks_available() * blocksize; + let total_bytes = stat.blocks() * blocksize; + + Ok(Usage { + config, + total_bytes, + avail_bytes, + }) + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 795c0cd3c4..478e9d228a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -27,6 +27,31 @@ paths: id: type: integer + /v1/disk_usage_eviction/run: + put: + description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space. + security: [] + requestBody: + content: + application/json: + schema: + type: object + required: + - evict_bytes + properties: + evict_bytes: + type: integer + responses: + "200": + description: | + The run completed. + This does not necessarily mean that we actually evicted `evict_bytes`. + Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`. + content: + application/json: + schema: + type: object + /v1/tenant/{tenant_id}: parameters: - name: tenant_id @@ -873,13 +898,9 @@ components: type: object properties: tenant_specific_overrides: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigInfo" effective_config: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigInfo" TimelineInfo: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b0addc82f1..2db60f557d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,6 +18,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -48,6 +49,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, + disk_usage_eviction_state: Arc, } impl State { @@ -55,6 +57,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -65,6 +68,7 @@ impl State { auth, allowlist_routes, remote_storage, + disk_usage_eviction_state, }) } } @@ -775,6 +779,8 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; + + let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) + .await + .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; + + tenant.set_broken("broken from test"); + + json_response(StatusCode::OK, ()) +} + #[cfg(feature = "testing")] async fn failpoints_handler(mut request: Request) -> Result, ApiError> { if !fail::has_failpoints() { @@ -1063,6 +1085,89 @@ async fn always_panic_handler(req: Request) -> Result, ApiE json_response(StatusCode::NO_CONTENT, ()) } +async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { + check_permission(&r, None)?; + + #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] + struct Config { + /// How many bytes to evict before reporting that pressure is relieved. + evict_bytes: u64, + } + + #[derive(Debug, Clone, Copy, serde::Serialize)] + struct Usage { + // remains unchanged after instantiation of the struct + config: Config, + // updated by `add_available_bytes` + freed_bytes: u64, + } + + impl crate::disk_usage_eviction_task::Usage for Usage { + fn has_pressure(&self) -> bool { + self.config.evict_bytes > self.freed_bytes + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.freed_bytes += bytes; + } + } + + let config = json_request::(&mut r) + .await + .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + + let usage = Usage { + config, + freed_bytes: 0, + }; + + use crate::task_mgr::MGMT_REQUEST_RUNTIME; + + let (tx, rx) = tokio::sync::oneshot::channel(); + + let state = get_state(&r); + + let Some(storage) = state.remote_storage.clone() else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "remote storage not configured, cannot run eviction iteration" + ))) + }; + + let state = state.disk_usage_eviction_state.clone(); + + let cancel = CancellationToken::new(); + let child_cancel = cancel.clone(); + let _g = cancel.drop_guard(); + + crate::task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "ondemand disk usage eviction", + false, + async move { + let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, + &storage, + usage, + &child_cancel, + ) + .await; + + info!(?res, "disk_usage_eviction_task_iteration_impl finished"); + + let _ = tx.send(res); + Ok(()) + } + .in_current_span(), + ); + + let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1075,6 +1180,7 @@ pub fn make_router( launch_ts: &'static LaunchTimestamp, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1119,7 +1225,8 @@ pub fn make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage).context("Failed to initialize router state")?, + State::new(conf, auth, remote_storage, disk_usage_eviction_state) + .context("Failed to initialize router state")?, )) .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put( @@ -1200,6 +1307,13 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", |r| RequestSpan(evict_timeline_layer_handler).handle(r), ) + .put("/v1/disk_usage_eviction/run", |r| { + RequestSpan(disk_usage_eviction_run).handle(r) + }) + .put( + "/v1/tenant/:tenant_id/break", + testing_api!("set tenant state to broken", handle_tenant_break), + ) .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 09e21ae755..278658eba3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -4,6 +4,7 @@ pub mod broker_client; pub mod config; pub mod consumption_metrics; pub mod context; +pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -12,6 +13,7 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6cb245aed7..1f31e5a8fb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -257,7 +257,7 @@ impl EvictionsWithLowResidenceDuration { } pub fn observe(&self, observed_value: Duration) { - if self.threshold < observed_value { + if observed_value < self.threshold { self.counter .as_ref() .expect("nobody calls this function after `remove_from_vec`") diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b63ee31d5e..c0e4a2a9cf 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,6 +27,7 @@ use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; +use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -466,8 +467,7 @@ impl PageServerHandler { pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_reader = StreamReader::new(copyin_stream(pgb)); - tokio::pin!(copyin_reader); + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); timeline .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx) .await?; @@ -512,8 +512,7 @@ impl PageServerHandler { info!("importing wal"); pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_reader = StreamReader::new(copyin_stream(pgb)); - tokio::pin!(copyin_reader); + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs new file mode 100644 index 0000000000..28d950b5e6 --- /dev/null +++ b/pageserver/src/statvfs.rs @@ -0,0 +1,150 @@ +//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking. + +use std::path::Path; + +pub enum Statvfs { + Real(nix::sys::statvfs::Statvfs), + Mock(mock::Statvfs), +} + +// NB: on macOS, the block count type of struct statvfs is u32. +// The workaround seems to be to use the non-standard statfs64 call. +// Sincce it should only be a problem on > 2TiB disks, let's ignore +// the problem for now and upcast to u64. +impl Statvfs { + pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result { + if let Some(mocked) = mocked { + Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?)) + } else { + Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?)) + } + } + + // NB: allow() because the block count type is u32 on macOS. + #[allow(clippy::useless_conversion)] + pub fn blocks(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(), + Statvfs::Mock(stat) => stat.blocks, + } + } + + // NB: allow() because the block count type is u32 on macOS. + #[allow(clippy::useless_conversion)] + pub fn blocks_available(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(), + Statvfs::Mock(stat) => stat.blocks_available, + } + } + + pub fn fragment_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.fragment_size(), + Statvfs::Mock(stat) => stat.fragment_size, + } + } + + pub fn block_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.block_size(), + Statvfs::Mock(stat) => stat.block_size, + } + } +} + +pub mod mock { + use anyhow::Context; + use regex::Regex; + use std::path::Path; + use tracing::log::info; + + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + Failure { + mocked_error: MockedError, + }, + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + + pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result { + info!("running mocked statvfs"); + + match behavior { + Behavior::Success { + blocksize, + total_blocks, + ref name_filter, + } => { + let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); + + // round it up to the nearest block multiple + let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + + if used_blocks > *total_blocks { + panic!( + "mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}" + ); + } + + let avail_blocks = total_blocks - used_blocks; + + Ok(Statvfs { + blocks: *total_blocks, + blocks_available: avail_blocks, + fragment_size: *blocksize, + block_size: *blocksize, + }) + } + Behavior::Failure { mocked_error } => Err((*mocked_error).into()), + } + } + + fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result { + let mut total = 0; + for entry in walkdir::WalkDir::new(path) { + let entry = entry?; + if !entry.file_type().is_file() { + continue; + } + if !name_filter + .as_ref() + .map(|filter| filter.is_match(entry.file_name().to_str().unwrap())) + .unwrap_or(true) + { + continue; + } + total += entry + .metadata() + .with_context(|| format!("get metadata of {:?}", entry.path()))? + .len(); + } + Ok(total) + } + + pub struct Statvfs { + pub blocks: u64, + pub blocks_available: u64, + pub fragment_size: u64, + pub block_size: u64, + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 44b1bbb06d..82aebc6c07 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -234,6 +234,9 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, + /// See [`crate::disk_usage_eviction_task`]. + DiskUsageEviction, + // Initial logical size calculation InitialLogicalSizeCalculation, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2c5226e5bc..7fac7d2ac0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,7 +95,7 @@ mod timeline; pub mod size; -pub use timeline::{PageReconstructError, Timeline}; +pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -1706,6 +1706,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } + pub fn get_min_resident_size_override(&self) -> Option { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .min_resident_size_override + .or(self.conf.default_tenant_conf.min_resident_size_override) + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { *self.tenant_conf.write().unwrap() = new_tenant_conf; } @@ -2783,6 +2790,7 @@ pub mod harness { max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), + min_resident_size_override: tenant_conf.min_resident_size_override, } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 48cb6be121..cdabb23a7b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -92,6 +92,7 @@ pub struct TenantConf { pub max_lsn_wal_lag: NonZeroU64, pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, + pub min_resident_size_override: Option, } /// Same as TenantConf, but this struct preserves the information about @@ -159,6 +160,10 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub min_resident_size_override: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -220,6 +225,9 @@ impl TenantConfOpt { .trace_read_requests .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), + min_resident_size_override: self + .min_resident_size_override + .or(global_conf.min_resident_size_override), } } } @@ -251,6 +259,7 @@ impl Default for TenantConf { .expect("cannot parse default max walreceiver Lsn wal lag"), trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, + min_resident_size_override: None, } } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c36b6121c0..2ee723e7c3 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -121,10 +121,10 @@ struct LayerAccessStatsInner { } #[derive(Debug, Clone, Copy)] -pub(super) struct LayerAccessStatFullDetails { - pub(super) when: SystemTime, - pub(super) task_kind: TaskKind, - pub(super) access_kind: LayerAccessKind, +pub(crate) struct LayerAccessStatFullDetails { + pub(crate) when: SystemTime, + pub(crate) task_kind: TaskKind, + pub(crate) access_kind: LayerAccessKind, } #[derive(Clone, Copy, strum_macros::EnumString)] @@ -255,7 +255,7 @@ impl LayerAccessStats { ret } - pub(super) fn most_recent_access_or_residence_event( + fn most_recent_access_or_residence_event( &self, ) -> Either { let locked = self.0.lock().unwrap(); @@ -268,6 +268,13 @@ impl LayerAccessStats { } } } + + pub(crate) fn latest_activity(&self) -> SystemTime { + match self.most_recent_access_or_residence_event() { + Either::Left(mra) => mra.when, + Either::Right(re) => re.timestamp, + } + } } /// Supertrait of the [`Layer`] trait that captures the bare minimum interface diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 20d1d2bfb6..8aeacc12f5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -244,14 +244,12 @@ pub(crate) async fn random_init_delay( ) -> Result<(), Cancelled> { use rand::Rng; + if period == Duration::ZERO { + return Ok(()); + } + let d = { let mut rng = rand::thread_rng(); - - // gen_range asserts that the range cannot be empty, which it could be because period can - // be set to zero to disable gc or compaction, so lets set it to be at least 10s. - let period = std::cmp::max(period, Duration::from_secs(10)); - - // semi-ok default as the source of jitter rng.gen_range(Duration::ZERO..=period) }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 611c2c27d3..e80e32644b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -13,6 +13,7 @@ use pageserver_api::models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState, }; +use remote_storage::GenericRemoteStorage; use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -24,6 +25,7 @@ use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; +use std::pin::pin; use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; @@ -677,8 +679,7 @@ impl Timeline { let mut failed = 0; - let cancelled = task_mgr::shutdown_watcher(); - tokio::pin!(cancelled); + let mut cancelled = pin!(task_mgr::shutdown_watcher()); loop { tokio::select! { @@ -957,6 +958,25 @@ impl Timeline { } } + /// Evict a batch of layers. + /// + /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured." + /// + /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html + pub async fn evict_layers( + &self, + _: &GenericRemoteStorage, + layers_to_evict: &[Arc], + cancel: CancellationToken, + ) -> anyhow::Result>>> { + let remote_client = self.remote_client.clone().expect( + "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient", + ); + + self.evict_layer_batch(&remote_client, layers_to_evict, cancel) + .await + } + /// Evict multiple layers at once, continuing through errors. /// /// Try to evict the given `layers_to_evict` by @@ -994,6 +1014,15 @@ impl Timeline { // now lock out layer removal (compaction, gc, timeline deletion) let layer_removal_guard = self.layer_removal_cs.lock().await; + { + // to avoid racing with detach and delete_timeline + let state = self.current_state(); + anyhow::ensure!( + state == TimelineState::Active, + "timeline is not active but {state:?}" + ); + } + // start the batch update let mut layer_map = self.layers.write().unwrap(); let mut batch_updates = layer_map.batch_update(); @@ -1027,6 +1056,8 @@ impl Timeline { use super::layer_map::Replacement; if local_layer.is_remote_layer() { + // TODO(issue #3851): consider returning an err here instead of false, + // which is the same out the match later return Ok(false); } @@ -1096,6 +1127,9 @@ impl Timeline { self.metrics .evictions_with_low_residence_duration .observe(delta); + info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period"); + } else { + info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period"); } true @@ -1837,13 +1871,13 @@ impl Timeline { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); - let calculation = async { + let mut calculation = pin!(async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation .calculate_logical_size(lsn, cancel, &ctx) .await - }; + }); let timeline_state_cancellation = async { loop { match timeline_state_updates.changed().await { @@ -1872,7 +1906,6 @@ impl Timeline { "aborted because task_mgr shutdown requested".to_string() }; - tokio::pin!(calculation); loop { tokio::select! { res = &mut calculation => { return res } @@ -4013,6 +4046,67 @@ impl Timeline { } } +pub struct DiskUsageEvictionInfo { + /// Timeline's largest layer (remote or resident) + pub max_layer_size: Option, + /// Timeline's resident layers + pub resident_layers: Vec, +} + +pub struct LocalLayerInfoForDiskUsageEviction { + pub layer: Arc, + pub last_activity_ts: SystemTime, +} + +impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it + // having to allocate a string to this is bad, but it will rarely be formatted + let ts = chrono::DateTime::::from(self.last_activity_ts); + let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); + f.debug_struct("LocalLayerInfoForDiskUsageEviction") + .field("layer", &self.layer) + .field("last_activity", &ts) + .finish() + } +} + +impl LocalLayerInfoForDiskUsageEviction { + pub fn file_size(&self) -> u64 { + self.layer.file_size() + } +} + +impl Timeline { + pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { + let layers = self.layers.read().unwrap(); + + let mut max_layer_size: Option = None; + let mut resident_layers = Vec::new(); + + for l in layers.iter_historic_layers() { + let file_size = l.file_size(); + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + + if l.is_remote_layer() { + continue; + } + + let last_activity_ts = l.access_stats().latest_activity(); + + resident_layers.push(LocalLayerInfoForDiskUsageEviction { + layer: l, + last_activity_ts, + }); + } + + DiskUsageEvictionInfo { + max_layer_size, + resident_layers, + } + } +} + type TraversalPathItem = ( ValueReconstructResult, Lsn, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3ec8c30d70..cf799a8808 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -20,7 +20,6 @@ use std::{ time::{Duration, SystemTime}, }; -use either::Either; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; @@ -185,13 +184,7 @@ impl Timeline { if hist_layer.is_remote_layer() { continue; } - let last_activity_ts = match hist_layer - .access_stats() - .most_recent_access_or_residence_event() - { - Either::Left(mra) => mra.when, - Either::Right(re) => re.timestamp, - }; + let last_activity_ts = hist_layer.access_stats().latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { @@ -399,7 +392,6 @@ impl Timeline { let mut throwaway_cache = HashMap::new(); let gather = crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx); - tokio::pin!(gather); tokio::select! { _ = cancel.cancelled() => {} diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 0c770136db..de07676ffe 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -237,11 +237,7 @@ async fn connection_manager_loop_step( if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { info!("Switching to new connection candidate: {new_candidate:?}"); walreceiver_state - .change_connection( - new_candidate.safekeeper_id, - new_candidate.wal_source_connconf, - ctx, - ) + .change_connection(new_candidate, ctx) .await } } @@ -346,6 +342,8 @@ struct WalConnection { started_at: NaiveDateTime, /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, + /// Availability zone of the safekeeper. + availability_zone: Option, /// Status of the connection. status: WalConnectionStatus, /// WAL streaming task handle. @@ -405,12 +403,7 @@ impl WalreceiverState { } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. - async fn change_connection( - &mut self, - new_sk_id: NodeId, - new_wal_source_connconf: PgConnectionConfig, - ctx: &RequestContext, - ) { + async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { self.drop_old_connection(true).await; let id = self.id; @@ -424,7 +417,7 @@ impl WalreceiverState { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - new_wal_source_connconf, + new_sk.wal_source_connconf, events_sender, cancellation, connect_timeout, @@ -433,13 +426,16 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) + .instrument( + info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id), + ) }); let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { started_at: now, - sk_id: new_sk_id, + sk_id: new_sk.safekeeper_id, + availability_zone: new_sk.availability_zone, status: WalConnectionStatus { is_connected: false, has_processed_wal: false, @@ -546,6 +542,7 @@ impl WalreceiverState { /// * if connected safekeeper is not present, pick the candidate /// * if we haven't received any updates for some time, pick the candidate /// * if the candidate commit_lsn is much higher than the current one, pick the candidate + /// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate /// /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. @@ -559,6 +556,7 @@ impl WalreceiverState { let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(Some(connected_sk_node))?; + let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone(); let now = Utc::now().naive_utc(); if let Ok(latest_interaciton) = @@ -569,6 +567,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoKeepAlives { last_keep_alive: Some( existing_wal_connection.status.latest_connection_update, @@ -594,6 +593,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::LaggingWal { current_commit_lsn, new_commit_lsn, @@ -601,6 +601,20 @@ impl WalreceiverState { }, }); } + // If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver, + // and the current one is not, switch to the new one. + if self.availability_zone.is_some() + && existing_wal_connection.availability_zone + != self.availability_zone + && self.availability_zone == new_availability_zone + { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + availability_zone: new_availability_zone, + wal_source_connconf: new_wal_source_connconf, + reason: ReconnectReason::SwitchAvailabilityZone, + }); + } } None => debug!( "Best SK candidate has its commit_lsn behind connected SK's commit_lsn" @@ -668,6 +682,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoWalTimeout { current_lsn, current_commit_lsn, @@ -686,10 +701,11 @@ impl WalreceiverState { self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal; } None => { - let (new_sk_id, _, new_wal_source_connconf) = + let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(None)?; return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, + availability_zone: new_safekeeper_broker_data.availability_zone.clone(), wal_source_connconf: new_wal_source_connconf, reason: ReconnectReason::NoExistingConnection, }); @@ -794,6 +810,7 @@ impl WalreceiverState { struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, + availability_zone: Option, // This field is used in `derive(Debug)` only. #[allow(dead_code)] reason: ReconnectReason, @@ -808,6 +825,7 @@ enum ReconnectReason { new_commit_lsn: Lsn, threshold: NonZeroU64, }, + SwitchAvailabilityZone, NoWalTimeout { current_lsn: Lsn, current_commit_lsn: Lsn, @@ -873,6 +891,7 @@ mod tests { peer_horizon_lsn: 0, local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), + availability_zone: None, }, latest_update, } @@ -933,6 +952,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1095,6 +1115,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1160,6 +1181,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1222,6 +1244,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { @@ -1289,4 +1312,74 @@ mod tests { availability_zone: None, } } + + #[tokio::test] + async fn switch_to_same_availability_zone() -> anyhow::Result<()> { + // Pageserver and one of safekeepers will be in the same availability zone + // and pageserver should prefer to connect to it. + let test_az = Some("test_az".to_owned()); + + let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let mut state = dummy_state(&harness).await; + state.availability_zone = test_az.clone(); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + + let connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: connected_sk_id, + availability_zone: None, + status: connection_status, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskStateUpdate::Progress(connection_status)) + .ok(); + Ok(()) + }), + discovered_new_wal: None, + }); + + // We have another safekeeper with the same commit_lsn, and it have the same availability zone as + // the current pageserver. + let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); + same_az_sk.timeline.availability_zone = test_az.clone(); + + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), + ), + (NodeId(1), same_az_sk), + ]); + + // We expect that pageserver will switch to the safekeeper in the same availability zone, + // even if it has the same commit_lsn. + let next_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(next_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + next_candidate.reason, + ReconnectReason::SwitchAvailabilityZone, + "Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn" + ); + assert_eq!( + next_candidate.wal_source_connconf.host(), + &Host::Domain("same_az".to_owned()) + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7194a4f3ed..ea2f2392ea 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -2,6 +2,7 @@ use std::{ error::Error, + pin::pin, str::FromStr, sync::Arc, time::{Duration, SystemTime}, @@ -17,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; -use tokio::{pin, select, sync::watch, time}; +use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; @@ -36,7 +37,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use pq_proto::ReplicationFeedback; +use pq_proto::PageserverFeedback; use utils::lsn::Lsn; /// Status of the connection. @@ -187,8 +188,7 @@ pub async fn handle_walreceiver_connection( let query = format!("START_REPLICATION PHYSICAL {startpoint}"); let copy_stream = replication_client.copy_both_simple(&query).await?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); + let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); @@ -319,12 +319,12 @@ pub async fn handle_walreceiver_connection( timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. - let write_lsn = u64::from(last_lsn); + let last_received_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); + let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn); let ts = SystemTime::now(); // Update the status about what we just received. This is shown in the mgmt API. @@ -343,12 +343,12 @@ pub async fn handle_walreceiver_connection( let (timeline_logical_size, _) = timeline .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; - let status_update = ReplicationFeedback { + let status_update = PageserverFeedback { current_timeline_size: timeline_logical_size, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, + last_received_lsn, + disk_consistent_lsn, + remote_consistent_lsn, + replytime: ts, }; debug!("neon_status_update {status_update:?}"); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b0b2a23e3c..45037a8c01 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1872,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } -/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) +ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf) { uint8 nkeys; int i; @@ -1892,45 +1892,45 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", rf->currentClusterSize); } - else if (strcmp(key, "ps_writelsn") == 0) + else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); + rf->last_received_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } - else if (strcmp(key, "ps_flushlsn") == 0) + else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); + rf->disk_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } - else if (strcmp(key, "ps_applylsn") == 0) + else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); + rf->remote_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } - else if (strcmp(key, "ps_replytime") == 0) + else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_replytime = pq_getmsgint64(reply_message); + rf->replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); + replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); + elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1944,7 +1944,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * * Skip unknown keys to support backward compatibile protocol * changes */ - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -2024,7 +2024,7 @@ GetAcknowledgedByQuorumWALPosition(void) } /* - * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + * WalproposerShmemSize --- report amount of shared memory space needed */ Size WalproposerShmemSize(void) @@ -2054,10 +2054,10 @@ WalproposerShmemInit(void) } void -replication_feedback_set(ReplicationFeedback * rf) +replication_feedback_set(PageserverFeedback * rf) { SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); SpinLockRelease(&walprop_shared->mutex); } @@ -2065,43 +2065,43 @@ void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.ps_writelsn; - *flushLsn = walprop_shared->feedback.ps_flushlsn; - *applyLsn = walprop_shared->feedback.ps_applylsn; + *writeLsn = walprop_shared->feedback.last_received_lsn; + *flushLsn = walprop_shared->feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } /* - * Get ReplicationFeedback fields from the most advanced safekeeper + * Get PageserverFeedback fields from the most advanced safekeeper */ static void -GetLatestNeonFeedback(ReplicationFeedback * rf) +GetLatestNeonFeedback(PageserverFeedback * rf) { int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + XLogRecPtr last_received_lsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { - if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) { latest_safekeeper = i; - ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn; } } rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; - rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; - rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; - rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; + rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; + rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; + rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime; elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); replication_feedback_set(rf); } @@ -2115,16 +2115,16 @@ HandleSafekeeperResponse(void) XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; if (!syncSafekeepers) { - /* Get ReplicationFeedback fields from the most advanced safekeeper */ + /* Get PageserverFeedback fields from the most advanced safekeeper */ GetLatestNeonFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { if (minQuorumLsn > quorumFeedback.flushLsn) @@ -2142,7 +2142,7 @@ HandleSafekeeperResponse(void) * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.ps_flushlsn, + quorumFeedback.rf.disk_consistent_lsn, GetCurrentTimestamp(), false); } @@ -2326,7 +2326,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); + ParsePageserverFeedbackMessage(&s, &msg->rf); pq_getmsgend(&s); return true; } @@ -2462,7 +2462,7 @@ backpressure_lag_impl(void) replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024 * 1024) - elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 537c733850..f016a229eb 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -280,21 +280,21 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; -typedef struct ReplicationFeedback +typedef struct PageserverFeedback { /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; - TimestampTz ps_replytime; -} ReplicationFeedback; + XLogRecPtr last_received_lsn; + XLogRecPtr disk_consistent_lsn; + XLogRecPtr remote_consistent_lsn; + TimestampTz replytime; +} PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - ReplicationFeedback feedback; + PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; } WalproposerShmemState; @@ -320,10 +320,10 @@ typedef struct AppendResponse /* Feedback recieved from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - ReplicationFeedback rf; + PageserverFeedback rf; } AppendResponse; -/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) @@ -383,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]); extern void WalProposerMain(Datum main_arg); extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); extern void WalProposerPoll(void); -extern void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); +extern void ParsePageserverFeedbackMessage(StringInfo reply_message, + PageserverFeedback *rf); extern void StartProposerReplication(StartReplicationCmd *cmd); extern Size WalproposerShmemSize(void); extern bool WalproposerShmemInit(void); -extern void replication_feedback_set(ReplicationFeedback *rf); +extern void replication_feedback_set(PageserverFeedback *rf); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 0692340147..c39ba4f417 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.66.1" +channel = "1.68.2" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8966e8c49b..ace921a26d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result}; use clap::Parser; use remote_storage::RemoteStorageConfig; use toml_edit::Document; +use utils::signals::ShutdownSignals; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; @@ -39,7 +40,7 @@ use utils::{ logging::{self, LogFormat}, project_git_version, sentry_init::init_sentry, - signals, tcp_listener, + tcp_listener, }; const PID_FILE_NAME: &str = "safekeeper.pid"; @@ -216,7 +217,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); @@ -274,15 +274,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { set_build_info_metric(GIT_VERSION); // TODO: put more thoughts into handling of failed threads - // We probably should restart them. + // We should catch & die if they are in trouble. - // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps - signals.handle(|signal| { - // TODO: implement graceful shutdown with joining threads etc - info!( - "received {}, terminating in immediate shutdown mode", - signal.name() - ); + // On any shutdown signal, log receival and exit. Additionally, handling + // SIGQUIT prevents coredump. + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); std::process::exit(0); }) } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 14badebd95..cdec45c148 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -242,6 +242,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result, peer_horizon_lsn: GenericGaugeVec, remote_consistent_lsn: GenericGaugeVec, - feedback_ps_write_lsn: GenericGaugeVec, + ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, @@ -339,15 +339,15 @@ impl TimelineCollector { .unwrap(); descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); - let feedback_ps_write_lsn = GenericGaugeVec::new( + let ps_last_received_lsn = GenericGaugeVec::new( Opts::new( - "safekeeper_feedback_ps_write_lsn", + "safekeeper_ps_last_received_lsn", "Last LSN received by the pageserver, acknowledged in the feedback", ), &["tenant_id", "timeline_id"], ) .unwrap(); - descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + descs.extend(ps_last_received_lsn.desc().into_iter().cloned()); let feedback_last_time_seconds = GenericGaugeVec::new( Opts::new( @@ -458,7 +458,7 @@ impl TimelineCollector { epoch_start_lsn, peer_horizon_lsn, remote_consistent_lsn, - feedback_ps_write_lsn, + ps_last_received_lsn, feedback_last_time_seconds, timeline_active, wal_backup_active, @@ -489,7 +489,7 @@ impl Collector for TimelineCollector { self.epoch_start_lsn.reset(); self.peer_horizon_lsn.reset(); self.remote_consistent_lsn.reset(); - self.feedback_ps_write_lsn.reset(); + self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); @@ -514,11 +514,11 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; + let mut most_advanced: Option = None; for replica in tli.replicas.iter() { if let Some(replica_feedback) = replica.pageserver_feedback { if let Some(current) = most_advanced { - if current.ps_writelsn < replica_feedback.ps_writelsn { + if current.last_received_lsn < replica_feedback.last_received_lsn { most_advanced = Some(replica_feedback); } } else { @@ -568,11 +568,10 @@ impl Collector for TimelineCollector { .set(tli.wal_storage.flush_wal_seconds); if let Some(feedback) = most_advanced { - self.feedback_ps_write_lsn + self.ps_last_received_lsn .with_label_values(labels) - .set(feedback.ps_writelsn); - if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) - { + .set(feedback.last_received_lsn); + if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) { self.feedback_last_time_seconds .with_label_values(labels) .set(unix_time.as_secs()); @@ -599,7 +598,7 @@ impl Collector for TimelineCollector { mfs.extend(self.epoch_start_lsn.collect()); mfs.extend(self.peer_horizon_lsn.collect()); mfs.extend(self.remote_consistent_lsn.collect()); - mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d8fe36d7f8..10b4842cbd 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -18,7 +18,7 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use pq_proto::{ReplicationFeedback, SystemId}; +use pq_proto::{PageserverFeedback, SystemId}; use utils::{ bin_ser::LeSer, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -360,7 +360,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub pageserver_feedback: ReplicationFeedback, + pub pageserver_feedback: PageserverFeedback, } impl AppendResponse { @@ -370,7 +370,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), } } } @@ -708,7 +708,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), }; trace!("formed AppendResponse {:?}", ar); ar diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index b533e87c5b..a6ca89efa4 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -11,7 +11,7 @@ use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; +use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -319,11 +319,9 @@ impl ReplyReader { // pageserver sends this. // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. let buf = Bytes::copy_from_slice(&msg[9..]); - let reply = ReplicationFeedback::parse(buf); + let reply = PageserverFeedback::parse(buf); - trace!("ReplicationFeedback is {:?}", reply); - // Only pageserver sends ReplicationFeedback, so set the flag. - // This replica is the source of information to resend to compute. + trace!("PageserverFeedback is {:?}", reply); self.feedback.pageserver_feedback = Some(reply); self.tli diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index fca460d998..9dd8a63cf0 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, bail, Result}; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; -use pq_proto::ReplicationFeedback; +use pq_proto::PageserverFeedback; use serde::Serialize; use std::cmp::{max, min}; use std::path::PathBuf; @@ -91,7 +91,7 @@ pub struct ReplicaState { /// combined hot standby feedback from all replicas pub hs_feedback: HotStandbyFeedback, /// Replication specific feedback received from pageserver, if any - pub pageserver_feedback: Option, + pub pageserver_feedback: Option, } impl Default for ReplicaState { @@ -276,7 +276,7 @@ impl SharedState { // if let Some(pageserver_feedback) = state.pageserver_feedback { if let Some(acc_feedback) = acc.pageserver_feedback { - if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn { + if acc_feedback.last_received_lsn < pageserver_feedback.last_received_lsn { warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet."); acc.pageserver_feedback = Some(pageserver_feedback); } @@ -287,12 +287,12 @@ impl SharedState { // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. // See https://github.com/neondatabase/neon/issues/1171 - acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); + acc.last_received_lsn = Lsn::from(pageserver_feedback.last_received_lsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, // safekeeper is free to delete it, so choose max of all pageservers. acc.remote_consistent_lsn = max( - Lsn::from(pageserver_feedback.ps_applylsn), + Lsn::from(pageserver_feedback.remote_consistent_lsn), acc.remote_consistent_lsn, ); } @@ -337,6 +337,7 @@ impl SharedState { safekeeper_connstr: conf.listen_pg_addr.clone(), backup_lsn: self.sk.inmem.backup_lsn.0, local_start_lsn: self.sk.state.local_start_lsn.0, + availability_zone: conf.availability_zone.clone(), } } } @@ -584,7 +585,7 @@ impl Timeline { let replica_state = shared_state.replicas[replica_id].unwrap(); let reported_remote_consistent_lsn = replica_state .pageserver_feedback - .map(|f| Lsn(f.ps_applylsn)) + .map(|f| Lsn(f.remote_consistent_lsn)) .unwrap_or(Lsn::INVALID); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore new file mode 100644 index 0000000000..d9d4d0296a --- /dev/null +++ b/scripts/sk_collect_dumps/.gitignore @@ -0,0 +1,2 @@ +result +*.json diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md new file mode 100644 index 0000000000..52b73e9495 --- /dev/null +++ b/scripts/sk_collect_dumps/readme.md @@ -0,0 +1,25 @@ +# Collect /v1/debug_dump from all safekeeper nodes + +1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. +2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. + +## How to use ansible (staging) + +``` +AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +``` + +## How to use ansible (prod) + +``` +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +``` + diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml new file mode 100644 index 0000000000..29ce83efde --- /dev/null +++ b/scripts/sk_collect_dumps/remote.yaml @@ -0,0 +1,18 @@ +- name: Fetch state dumps from safekeepers + hosts: safekeepers + gather_facts: False + remote_user: "{{ remote_user }}" + + tasks: + - name: Download file + get_url: + url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false" + dest: "/tmp/{{ inventory_hostname }}.json" + + - name: Fetch file from remote hosts + fetch: + src: "/tmp/{{ inventory_hostname }}.json" + dest: "./result/{{ inventory_hostname }}.json" + flat: yes + fail_on_missing: no + diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh new file mode 100755 index 0000000000..2e54ecba1c --- /dev/null +++ b/scripts/sk_collect_dumps/upload.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +if [ -z "$DB_CONNSTR" ]; then + echo "DB_CONNSTR is not set" + exit 1 +fi + +# Create a temporary table for JSON data +psql $DB_CONNSTR -c 'DROP TABLE IF EXISTS tmp_json' +psql $DB_CONNSTR -c 'CREATE TABLE tmp_json (data jsonb)' + +for file in ./result/*.json; do + echo "$file" + SK_ID=$(jq '.config.id' $file) + echo "SK_ID: $SK_ID" + jq -c ".timelines[] | . + {\"sk_id\": $SK_ID}" $file | psql $DB_CONNSTR -c "\\COPY tmp_json (data) FROM STDIN" +done + +TABLE_NAME=$1 + +if [ -z "$TABLE_NAME" ]; then + echo "TABLE_NAME is not set, skipping conversion to table with typed columns" + echo "Usage: ./upload.sh TABLE_NAME" + exit 0 +fi + +psql $DB_CONNSTR <>'sk_id')::bigint AS sk_id, + (data->>'tenant_id') AS tenant_id, + (data->>'timeline_id') AS timeline_id, + (data->'memory'->>'active')::bool AS active, + (data->'memory'->>'flush_lsn')::bigint AS flush_lsn, + (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn, + (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn, + (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn, + (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn, + (data->'memory'->>'write_lsn')::bigint AS write_lsn, + (data->'memory'->>'num_computes')::bigint AS num_computes, + (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn, + (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno, + (data->'memory'->>'is_cancelled')::bool AS is_cancelled, + (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn, + (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn, + (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term, + (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn, + (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn, + (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn, + (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn +FROM tmp_json +EOF diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index f3544a7cb8..6563fec8b6 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -133,6 +133,7 @@ async fn publish(client: Option, n_keys: u64) { peer_horizon_lsn: 5, safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(), local_start_lsn: 0, + availability_zone: None, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 1a46896d02..4b2de1a8e5 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -36,9 +36,11 @@ message SafekeeperTimelineInfo { uint64 local_start_lsn = 9; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; + // Availability zone of a safekeeper. + optional string availability_zone = 11; } message TenantTimelineId { bytes tenant_id = 1; bytes timeline_id = 2; -} \ No newline at end of file +} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1a0d261184..d7ace28426 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -33,6 +33,7 @@ use tonic::transport::server::Connected; use tonic::Code; use tonic::{Request, Response, Status}; use tracing::*; +use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; @@ -437,6 +438,14 @@ async fn main() -> Result<(), Box> { info!("version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); + // On any shutdown signal, log receival and exit. + std::thread::spawn(move || { + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); + std::process::exit(0); + }) + }); + let registry = Registry { shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))), timeline_chan_size: args.timeline_chan_size, @@ -516,6 +525,7 @@ mod tests { peer_horizon_lsn: 5, safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(), local_start_lsn: 0, + availability_zone: None, } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9929d3e66b..a232bf8b6d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1220,6 +1220,28 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) return TenantConfig.from_json(res.json()) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): + assert "tenant_id" not in config.keys() + res = self.put( + f"http://localhost:{self.port}/v1/tenant/config", + json={**config, "tenant_id": str(tenant_id)}, + ) + self.verbose_error(res) + + def patch_tenant_config_client_side( + self, + tenant_id: TenantId, + inserts: Optional[Dict[str, Any]] = None, + removes: Optional[List[str]] = None, + ): + current = self.tenant_config(tenant_id).tenant_specific_overrides + if inserts is not None: + current.update(inserts) + if removes is not None: + for key in removes: + del current[key] + self.set_tenant_config(tenant_id, current) + def tenant_size(self, tenant_id: TenantId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] @@ -1536,6 +1558,18 @@ class PageserverHttpClient(requests.Session): for layer in info.historic_layers: self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + def disk_usage_eviction_run(self, request: dict[str, Any]): + res = self.put( + f"http://localhost:{self.port}/v1/disk_usage_eviction/run", + json=request, + ) + self.verbose_error(res) + return res.json() + + def tenant_break(self, tenant_id: TenantId): + res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") + self.verbose_error(res) + @dataclass class TenantConfig: diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py new file mode 100644 index 0000000000..6ed09734fe --- /dev/null +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -0,0 +1,541 @@ +import shutil +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Tuple + +import pytest +import toml +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + PgBin, + RemoteStorageKind, + wait_for_last_flush_lsn, + wait_for_upload_queue_empty, + wait_until, +) +from fixtures.types import Lsn, TenantId, TimelineId + +GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" + + +@pytest.mark.parametrize("config_level_override", [None, 400]) +def test_min_resident_size_override_handling( + neon_env_builder: NeonEnvBuilder, config_level_override: int +): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + def assert_config(tenant_id, expect_override, expect_effective): + config = ps_http.tenant_config(tenant_id) + assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override + assert config.effective_config.get("min_resident_size_override") == expect_effective + + def assert_overrides(tenant_id, default_tenant_conf_value): + ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200}) + assert_config(tenant_id, 200, 200) + + ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0}) + assert_config(tenant_id, 0, 0) + + ps_http.set_tenant_config(tenant_id, {}) + assert_config(tenant_id, None, default_tenant_conf_value) + + env.pageserver.stop() + if config_level_override is not None: + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ min_resident_size_override = " + + str(config_level_override) + + " }", + ) + ) + else: + env.pageserver.start() + + tenant_id, _ = env.neon_cli.create_tenant() + assert_overrides(tenant_id, config_level_override) + + # Also ensure that specifying the paramter to create_tenant works, in addition to http-level recconfig. + tenant_id, _ = env.neon_cli.create_tenant(conf={"min_resident_size_override": "100"}) + assert_config(tenant_id, 100, 100) + ps_http.set_tenant_config(tenant_id, {}) + assert_config(tenant_id, None, config_level_override) + + +@dataclass +class EvictionEnv: + timelines: list[Tuple[TenantId, TimelineId]] + neon_env: NeonEnv + pg_bin: PgBin + pageserver_http: PageserverHttpClient + layer_size: int + pgbench_init_lsns: Dict[TenantId, Lsn] + + def timelines_du(self) -> Tuple[int, int, int]: + return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines]) + + def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]: + return { + (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0] + for tid, tlid in self.timelines + } + + def warm_up_tenant(self, tenant_id: TenantId): + """ + Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it. + This assumes that the tenant is still at the state after pbench -i. + """ + lsn = self.pgbench_init_lsns[tenant_id] + with self.neon_env.postgres.create_start("main", tenant_id=tenant_id, lsn=lsn) as pg: + self.pg_bin.run(["pgbench", "-S", pg.connstr()]) + + def pageserver_start_with_disk_usage_eviction( + self, period, max_usage_pct, min_avail_bytes, mock_behavior + ): + disk_usage_config = { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + } + + enc = toml.TomlEncoder() + + self.neon_env.pageserver.start( + overrides=( + "--pageserver-config-override=disk_usage_based_eviction=" + + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + ), + ) + + def statvfs_called(): + assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*") + + wait_until(10, 1, statvfs_called) + + +@pytest.fixture +def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: + """ + Creates two tenants, one somewhat larger than the other. + """ + + log.info(f"setting up eviction_env for test {request.node.name}") + + neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # allow because we are invoking this manually; we always warn on executing disk based eviction + env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") + + # remove the initial tenant + ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865 + assert env.initial_timeline + wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline) + pageserver_http.tenant_detach(env.initial_tenant) + assert isinstance(env.remote_storage, LocalFsStorage) + tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant) + assert tenant_remote_storage.is_dir() + shutil.rmtree(tenant_remote_storage) + env.initial_tenant = TenantId("0" * 32) + env.initial_timeline = None + + # Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers. + # Large count of layers and small layer size is good for testing because it makes evictions predictable. + # Predictable in the sense that many layer evictions will be required to reach the eviction target, because + # each eviction only makes small progress. That means little overshoot, and thereby stable asserts. + pgbench_scales = [4, 6] + layer_size = 5 * 1024**2 + + pgbench_init_lsns = {} + + timelines = [] + for scale in pgbench_scales: + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{layer_size}", + "image_creation_threshold": "100", + "compaction_target_size": f"{layer_size}", + } + ) + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + pg_bin.run(["pgbench", "-i", f"-s{scale}", pg.connstr()]) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + timelines.append((tenant_id, timeline_id)) + + # stop the safekeepers to avoid on-demand downloads caused by + # initial logical size calculation triggered by walreceiver connection status + # when we restart the pageserver process in any of the tests + env.neon_cli.safekeeper_stop() + + # after stopping the safekeepers, we know that no new WAL will be coming in + for tenant_id, timeline_id in timelines: + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id) + tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) + assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] + assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] + pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"]) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + log.info(f"{layers}") + assert ( + len(layers.historic_layers) >= 10 + ), "evictions happen at layer granularity, but we often assert at byte-granularity" + + eviction_env = EvictionEnv( + timelines=timelines, + neon_env=env, + pageserver_http=pageserver_http, + layer_size=layer_size, + pg_bin=pg_bin, + pgbench_init_lsns=pgbench_init_lsns, + ) + + return eviction_env + + +def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): + env = eviction_env + + env.neon_env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + broken_tenant_id, broken_timeline_id = env.timelines[0] + env.pageserver_http.tenant_break(broken_tenant_id) + + healthy_tenant_id, healthy_timeline_id = env.timelines[1] + + broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) + healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + + # try to evict everything, then validate that broken tenant wasn't touched + target = broken_size_pre + healthy_size_pre + + response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) + healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + + assert broken_size_pre == broken_size_post, "broken tenant should not be touched" + assert healthy_size_post < healthy_size_pre + assert healthy_size_post == 0 + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + + +def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): + """ + Basic test to ensure that we evict enough to relieve pressure. + """ + env = eviction_env + pageserver_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + + target = total_on_disk // 2 + + response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + + actual_change = total_on_disk - later_total_on_disk + + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "must evict more than half" + assert ( + response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change + ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" + + +def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): + """ + Override tenant min resident and ensure that it will be respected by eviction. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + du_by_timeline = env.du_by_timeline() + log.info("du_by_timeline: %s", du_by_timeline) + + assert len(du_by_timeline) == 2, "this test assumes two tenants" + large_tenant = max(du_by_timeline, key=du_by_timeline.__getitem__) + small_tenant = min(du_by_timeline, key=du_by_timeline.__getitem__) + assert du_by_timeline[large_tenant] > du_by_timeline[small_tenant] + assert ( + du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size + ), "ensure this test will do more than 1 eviction" + + # Give the larger tenant a haircut while preventing the smaller tenant from getting one. + # To prevent the smaller from getting a haircut, we set min_resident_size to its current size. + # To ensure the larger tenant is getting a haircut, any non-zero `target` will do. + min_resident_size = du_by_timeline[small_tenant] + target = 1 + assert ( + du_by_timeline[large_tenant] > min_resident_size + ), "ensure the larger tenant will get a haircut" + ps_http.patch_tenant_config_client_side( + small_tenant[0], {"min_resident_size_override": min_resident_size} + ) + ps_http.patch_tenant_config_client_side( + large_tenant[0], {"min_resident_size_override": min_resident_size} + ) + + # Make the large tenant more-recently used. An incorrect implemention would try to evict + # the smaller tenant completely first, before turning to the larger tenant, + # since the smaller tenant's layers are least-recently-used. + env.warm_up_tenant(large_tenant[0]) + + # do one run + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + time.sleep(1) # give log time to flush + assert not env.neon_env.pageserver.log_contains( + GLOBAL_LRU_LOG_LINE, + ), "this test is pointless if it fell back to global LRU" + + (later_total_on_disk, _, _) = env.timelines_du() + later_du_by_timeline = env.du_by_timeline() + log.info("later_du_by_timeline: %s", later_du_by_timeline) + + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + assert ( + response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change + ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" + + assert ( + later_du_by_timeline[small_tenant] == du_by_timeline[small_tenant] + ), "small tenant sees no haircut" + assert ( + later_du_by_timeline[large_tenant] < du_by_timeline[large_tenant] + ), "large tenant gets a haircut" + assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target + + +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): + """ + If we can't relieve pressure using tenant_min_resident_size-respecting eviction, + we should continue to evict layers following global LRU. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + target = total_on_disk + + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + + time.sleep(1) # give log time to flush + assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + + +def test_partial_evict_tenant(eviction_env: EvictionEnv): + """ + Warm up a tenant, then build up pressure to cause in evictions in both. + We expect + * the default min resident size to be respect (largest layer file size) + * the warmed-up tenants layers above min resident size to be evicted after the cold tenant's. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + du_by_timeline = env.du_by_timeline() + + # pick any tenant + [our_tenant, other_tenant] = list(du_by_timeline.keys()) + (tenant_id, timeline_id) = our_tenant + + # make our tenant more recently used than the other one + env.warm_up_tenant(tenant_id) + + # Build up enough pressure to require evictions from both tenants, + # but not enough to fall into global LRU. + # So, set target to all occipied space, except 2*env.layer_size per tenant + target = ( + du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size + ) + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + + later_du_by_timeline = env.du_by_timeline() + for tenant, later_tenant_usage in later_du_by_timeline.items(): + assert ( + later_tenant_usage < du_by_timeline[tenant] + ), "all tenants should have lost some layers" + + assert ( + later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant] + ), "our warmed up tenant should be at about half capacity, part 1" + assert ( + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. + # So, check for up to 3 here. + later_du_by_timeline[our_tenant] + < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size + ), "our warmed up tenant should be at about half capacity, part 2" + assert ( + later_du_by_timeline[other_tenant] < 2 * env.layer_size + ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size" + + +def poor_mans_du( + env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]] +) -> Tuple[int, int, int]: + """ + Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples; + this could be done over layers endpoint just as well. + """ + total_on_disk = 0 + largest_layer = 0 + smallest_layer = None + for tenant_id, timeline_id in timelines: + dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + assert dir.exists(), f"timeline dir does not exist: {dir}" + sum = 0 + for file in dir.iterdir(): + if "__" not in file.name: + continue + size = file.stat().st_size + sum += size + largest_layer = max(largest_layer, size) + if smallest_layer: + smallest_layer = min(smallest_layer, size) + else: + smallest_layer = size + log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}") + + log.info(f"{tenant_id}/{timeline_id}: sum {sum}") + total_on_disk += sum + + assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0 + return (total_on_disk, largest_layer, smallest_layer or 0) + + +def test_statvfs_error_handling(eviction_env: EvictionEnv): + """ + We should log an error that statvfs fails. + """ + env = eviction_env + env.neon_env.pageserver.stop() + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=90, + min_avail_bytes=0, + mock_behavior={ + "type": "Failure", + "mocked_error": "EIO", + }, + ) + + assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") + + +def test_statvfs_pressure_usage(eviction_env: EvictionEnv): + """ + If statvfs data shows 100% usage, the eviction task will drive it down to + the configured max_usage_pct. + """ + env = eviction_env + + env.neon_env.pageserver.stop() + + # make it seem like we're at 100% utilization by setting total bytes to the used bytes + total_size, _, _ = env.timelines_du() + blocksize = 512 + total_blocks = (total_size + (blocksize - 1)) // blocksize + + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=33, + min_avail_bytes=0, + mock_behavior={ + "type": "Success", + "blocksize": blocksize, + "total_blocks": total_blocks, + # Only count layer files towards used bytes in the mock_statvfs. + # This avoids accounting for metadata files & tenant conf in the tests. + "name_filter": ".*__.*", + }, + ) + + def relieved_log_message(): + assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + + wait_until(10, 1, relieved_log_message) + + post_eviction_total_size, _, _ = env.timelines_du() + + assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + + +def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): + """ + If statvfs data shows 100% usage, the eviction task will drive it down to + at least the configured min_avail_bytes. + """ + env = eviction_env + + env.neon_env.pageserver.stop() + + # make it seem like we're at 100% utilization by setting total bytes to the used bytes + total_size, _, _ = env.timelines_du() + blocksize = 512 + total_blocks = (total_size + (blocksize - 1)) // blocksize + + min_avail_bytes = total_size // 3 + + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=100, + min_avail_bytes=min_avail_bytes, + mock_behavior={ + "type": "Success", + "blocksize": blocksize, + "total_blocks": total_blocks, + # Only count layer files towards used bytes in the mock_statvfs. + # This avoids accounting for metadata files & tenant conf in the tests. + "name_filter": ".*__.*", + }, + ) + + def relieved_log_message(): + assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + + wait_until(10, 1, relieved_log_message) + + post_eviction_total_size, _, _ = env.timelines_du() + + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), "we requested at least min_avail_bytes worth of free space" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 9fd9794436..757df1dab8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 9fd9794436d02fbfe68f8fca5beab218907cec41 +Subproject commit 757df1dab82f69bdf69469119420a0bbb307f992 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 257aaefb25..f8a650e49b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 257aaefb251c5c85c44652c01bf68c43db62748a +Subproject commit f8a650e49b06d39ad131b860117504044b01f312