From 45bf76eb05944e1356ad0b7158e90a3d4502a2da Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 3 Apr 2023 14:57:36 +0200 Subject: [PATCH] enable layer eviction by default in prod (#3933) Leave disk_usage_based_eviction's `max_usage_pct` at 85%, above the current maximum disk usage in prod (roughly 82%), so that deploying this commit won't trigger disk_usage_based_eviction. As indicated in the TODO, we'll decrease the value to 80% later, once all pageservers are below 80%. Also update the staging YAMLs to use the anchor syntax for `evictions_low_residence_duration_metric_threshold` like we do in the prod YAMLs as of this patch. --- .github/ansible/prod.ap-southeast-1.hosts.yaml | 10 ++++++++++ .github/ansible/prod.eu-central-1.hosts.yaml | 10 ++++++++++ .github/ansible/prod.us-east-2.hosts.yaml | 10 ++++++++++ .github/ansible/prod.us-west-2.hosts.yaml | 10 ++++++++++ .github/ansible/staging.eu-west-1.hosts.yaml | 8 ++------ .github/ansible/staging.us-east-2.hosts.yaml | 8 ++------ 6 files changed, 44 insertions(+), 12 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 8ccb67b04a..c185086eef 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index b3cd5de01c..0a0f974ea4 100644 --- 
a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 22c705e1cf..4427bb344e 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index f03e2d9435..53626b4f59 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + 
disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index b634345c72..34c8e77280 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,20 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min - evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 - # TODO: learn typical resident-size growth rate [GiB/minute] and configure - # min_avail_bytes such that we have X minutes of headroom. min_avail_bytes: 0 - # We assume that the worst-case growth rate is small enough that we can - # catch above-threshold conditions by checking every 10s. 
period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index c1ceaa61ee..94f2be83a4 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,20 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min - evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 - # TODO: learn typical resident-size growth rate [GiB/minute] and configure - # min_avail_bytes such that we have X minutes of headroom. min_avail_bytes: 0 - # We assume that the worst-case growth rate is small enough that we can - # catch above-threshold conditions by checking every 10s. period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}"