diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md new file mode 100644 index 0000000000..3f32b80ca8 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -0,0 +1,10 @@ +## Describe your changes + +## Issue ticket number and link + +## Checklist before requesting a review +- [ ] I have performed a self-review of my code. +- [ ] If it is a core feature, I have added thorough tests. +- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? +- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. + diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md index 8fcc3bd4af..a848077e6a 100644 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -14,7 +14,7 @@ - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) -- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1) -- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time) +- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1) +- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time) diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml deleted file mode 100644 
index 5d5df5a6d5..0000000000 --- a/.github/ansible/neon-stress.hosts.yaml +++ /dev/null @@ -1,32 +0,0 @@ -storage: - vars: - bucket_name: neon-storage-ireland - bucket_region: eu-west-1 - console_mgmt_base_url: http://neon-stress-console.local - broker_endpoint: http://storage-broker.neon-stress.local:50051 - safekeeper_enable_s3_offload: 'false' - pageserver_config_stub: - pg_distrib_dir: /usr/local - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: neon-stress/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: development - children: - pageservers: - hosts: - neon-stress-ps-1: - console_region_id: aws-eu-west-1 - neon-stress-ps-2: - console_region_id: aws-eu-west-1 - safekeepers: - hosts: - neon-stress-sk-1: - console_region_id: aws-eu-west-1 - neon-stress-sk-2: - console_region_id: aws-eu-west-1 - neon-stress-sk-3: - console_region_id: aws-eu-west-1 diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index cfcc3a9ae8..fce450ed39 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -6,6 +6,8 @@ storage: broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local + metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml deleted file mode 100644 index 79acfd1d2a..0000000000 --- a/.github/ansible/staging.hosts.yaml +++ /dev/null @@ -1,35 +0,0 @@ -storage: - vars: - bucket_name: zenith-staging-storage-us-east-1 - bucket_region: us-east-1 - console_mgmt_base_url: http://console-staging.local - broker_endpoint: 
http://storage-broker.staging.local:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: us-stage/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: development - - children: - pageservers: - hosts: - zenith-us-stage-ps-2: - console_region_id: aws-us-east-1 - zenith-us-stage-ps-3: - console_region_id: aws-us-east-1 - zenith-us-stage-ps-4: - console_region_id: aws-us-east-1 - - safekeepers: - hosts: - zenith-us-stage-sk-4: - console_region_id: aws-us-east-1 - zenith-us-stage-sk-5: - console_region_id: aws-us-east-1 - zenith-us-stage-sk-6: - console_region_id: aws-us-east-1 diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 78a4582e57..1d1b8dbfa4 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -6,6 +6,8 @@ storage: broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local + metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" @@ -25,6 +27,8 @@ storage: ansible_host: i-0c3e70929edb5d691 pageserver-1.us-east-2.aws.neon.build: ansible_host: i-0565a8b4008aa3f40 + pageserver-2.us-east-2.aws.neon.build: + ansible_host: i-01e31cdf7e970586a safekeepers: hosts: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index ae9c1f2e40..08304503c5 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: 
"http://console-staging.local/management/api/v2" domain: "*.eu-west-1.aws.neon.build" sentryEnvironment: "development" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index a2f932e4fb..be0fc329c9 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.cloud.stage.neon.tech" sentryEnvironment: "development" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 1138536e94..b7f712585b 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.us-east-2.aws.neon.build" sentryEnvironment: "development" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip 
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/neon-stress.neon-storage-broker.yaml b/.github/helm-values/neon-stress.neon-storage-broker.yaml deleted file mode 100644 index e11e5d4214..0000000000 --- a/.github/helm-values/neon-stress.neon-storage-broker.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Helm chart values for neon-storage-broker -podLabels: - neon_env: neon-stress - neon_service: storage-broker - -# Use L4 LB -service: - # service.annotations -- Annotations to add to the service - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet - # assign service to this name at external-dns - external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local - # service.type -- Service type - type: LoadBalancer - # service.port -- broker listen port - port: 50051 - -ingress: - enabled: false - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-storage-broker.fullname\" . 
}}" - labels: - helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} - app.kubernetes.io/name: neon-storage-broker - app.kubernetes.io/instance: neon-storage-broker - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-storage-broker" - endpoints: - - port: broker - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" - -settings: - sentryEnvironment: "development" diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml deleted file mode 100644 index ed580349fc..0000000000 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ /dev/null @@ -1,52 +0,0 @@ -fullnameOverride: "neon-stress-proxy-scram" - -settings: - authBackend: "console" - authEndpoint: "http://neon-stress-console.local/management/api/v2" - domain: "*.stress.neon.tech" - sentryEnvironment: "development" - -podLabels: - zenith_service: proxy-scram - zenith_env: staging - zenith_region: eu-west-1 - zenith_region_slug: ireland - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech' - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . }}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml deleted file mode 100644 index 94270ced09..0000000000 --- a/.github/helm-values/neon-stress.proxy.yaml +++ /dev/null @@ -1,61 +0,0 @@ -fullnameOverride: "neon-stress-proxy" - -settings: - authBackend: "link" - authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" - uri: "https://console.dev.neon.tech/psql_session/" - sentryEnvironment: "development" - -# -- Additional labels for zenith-proxy pods -podLabels: - zenith_service: proxy - zenith_env: staging - zenith_region: eu-west-1 - zenith_region_slug: ireland - -service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal - external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local - type: LoadBalancer - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . 
}}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 4e4aff1f9e..e9e89aff7c 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 94290a87e1..5366ba4ae5 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: 
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 1a4023708b..e71e457f13 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.us-east-2.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 2942d6a2aa..9afe94edd1 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.us-west-2.aws.neon.tech" sentryEnvironment: "production" + wssPort: 8443 # -- Additional labels for neon-proxy pods podLabels: @@ -23,6 +24,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech + httpsPort: 443 #metrics: # enabled: true diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index 
c7143cd61a..8143f7e575 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -3,6 +3,7 @@ settings: authEndpoint: "http://console-release.local/management/api/v2" domain: "*.cloud.neon.tech" sentryEnvironment: "production" + wssPort: 8443 podLabels: zenith_service: proxy-scram @@ -16,6 +17,7 @@ exposedService: service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech' + httpsPort: 443 metrics: enabled: true diff --git a/.github/helm-values/staging.neon-storage-broker.yaml b/.github/helm-values/staging.neon-storage-broker.yaml deleted file mode 100644 index 6b21c286a1..0000000000 --- a/.github/helm-values/staging.neon-storage-broker.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Helm chart values for neon-storage-broker -podLabels: - neon_env: staging - neon_service: storage-broker - -# Use L4 LB -service: - # service.annotations -- Annotations to add to the service - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet - # assign service to this name at external-dns - external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local - # service.type -- Service type - type: LoadBalancer - # service.port -- broker listen port - port: 50051 - -ingress: - enabled: false - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-storage-broker.fullname\" . 
}}" - labels: - helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} - app.kubernetes.io/name: neon-storage-broker - app.kubernetes.io/instance: neon-storage-broker - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-storage-broker" - endpoints: - - port: broker - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" - -settings: - sentryEnvironment: "development" diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml deleted file mode 100644 index 66f9921c9a..0000000000 --- a/.github/helm-values/staging.proxy-scram.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Helm chart values for zenith-proxy. -# This is a YAML-formatted file. - -image: - repository: neondatabase/neon - -settings: - authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" - domain: "*.cloud.stage.neon.tech" - sentryEnvironment: "development" - -# -- Additional labels for zenith-proxy pods -podLabels: - zenith_service: proxy-scram - zenith_env: staging - zenith_region: us-east-1 - zenith_region_slug: virginia - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . 
}}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml deleted file mode 100644 index a22082e625..0000000000 --- a/.github/helm-values/staging.proxy.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Helm chart values for zenith-proxy. -# This is a YAML-formatted file. - -image: - repository: neondatabase/neon - -settings: - authBackend: "link" - authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" - uri: "https://console.stage.neon.tech/psql_session/" - sentryEnvironment: "development" - -# -- Additional labels for zenith-proxy pods -podLabels: - zenith_service: proxy - zenith_env: staging - zenith_region: us-east-1 - zenith_region_slug: virginia - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . }}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index e3e0f1e820..59317f0a47 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -18,6 +18,7 @@ on: region_id: description: 'Use a particular region. If not set the default region will be used' required: false + default: 'aws-us-east-2' save_perf_report: type: boolean description: 'Publish perf report or not. If not set, the report is published only for the main branch' @@ -115,13 +116,10 @@ jobs: # neon-captest-prefetch: Same, with prefetching enabled (new project) # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ] + platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ] db_size: [ 10gb ] runner: [ us-east-2 ] include: - - platform: neon-captest-new - db_size: 50gb - runner: us-east-2 - platform: neon-captest-prefetch db_size: 50gb runner: us-east-2 @@ -409,7 +407,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init timeout-minutes: 360 # 6h diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 43b855a2b0..1bbba8e3fd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -111,6 +111,7 @@ jobs: # Some of our rust modules use 
FFI and need those to be checked - name: Get postgres headers run: make postgres-headers -j$(nproc) + - name: Run cargo clippy run: ./run_clippy.sh @@ -126,6 +127,11 @@ jobs: cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check + build-neon: runs-on: [ self-hosted, dev, x64 ] container: @@ -177,13 +183,12 @@ jobs: # corresponding Cargo.toml files for their descriptions. - name: Set env variables run: | + CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="--features testing" CARGO_FLAGS="--locked $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features testing,profiling" CARGO_FLAGS="--locked --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV @@ -555,10 +560,14 @@ jobs: - name: Kaniko build compute tools run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} - compute-node-image-v14: + compute-node-image: runs-on: [ self-hosted, dev, x64 ] container: gcr.io/kaniko-project/executor:v1.9.0-debug needs: [ tag ] + strategy: + fail-fast: false + matrix: + version: [ v14, v15 ] defaults: run: shell: sh -eu {0} @@ -573,32 +582,40 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build compute node with extensions v14 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Kaniko build compute node with extensions + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - compute-node-image-v15: + vm-compute-node-image: runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug - needs: [ tag ] + needs: [ tag, compute-node-image ] + strategy: + fail-fast: false + matrix: + version: [ v14, v15 ] defaults: run: shell: sh -eu {0} steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko - with: - submodules: true - fetch-depth: 0 + - name: Downloading latest vm-builder + run: | + curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder + chmod +x vm-builder - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Pulling compute-node image + run: | + docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Kaniko build compute node with extensions v15 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Build vm image + run: | + ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Pushing vm-compute-node image + run: | + docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} test-images: - needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + needs: [ tag, neon-image, compute-node-image, compute-tools-image ] runs-on: [ self-hosted, dev, x64 ] steps: @@ -642,13 +659,13 @@ jobs: promote-images: runs-on: [ self-hosted, dev, x64 ] - needs: [ tag, test-images ] + needs: [ tag, test-images, vm-compute-node-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-node-v14, compute-node-v15, compute-tools ] + name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - name: Promote image to latest @@ -681,9 +698,15 @@ jobs: - name: Pull compute node v14 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 + - name: Pull vm compute node v14 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 + - name: Pull compute node v15 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15 + - name: Pull vm compute node v15 image from ECR 
+ run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 + - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust @@ -695,7 +718,9 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - name: Configure Docker Hub login run: | @@ -712,9 +737,15 @@ jobs: - name: Push compute node v14 image to Docker Hub run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push vm compute node v14 image to Docker Hub + run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push compute node v15 image to Docker Hub run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Push vm compute node v15 image to Docker Hub + run: crane push vm-compute-node-v15 
neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned @@ -726,26 +757,25 @@ jobs: crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, dev, x64 ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' outputs: matrix-include: ${{ steps.set-matrix.outputs.include }} steps: - id: set-matrix run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' - echo "include=[$STAGING]" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + if [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" exit 1 fi @@ -756,7 
+786,7 @@ jobs: # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: @@ -764,6 +794,8 @@ jobs: strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + environment: + name: prod-old steps: - name: Checkout uses: actions/checkout@v3 @@ -800,7 +832,7 @@ jobs: container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + needs: [ push-docker-hub, tag, regress-tests ] if: | (github.ref_name == 'main') && github.event_name != 'workflow_dispatch' @@ -809,7 +841,9 @@ jobs: shell: bash strategy: matrix: - target_region: [ us-east-2 ] + target_region: [ eu-west-1, us-east-2 ] + environment: + name: dev-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -881,6 +915,8 @@ jobs: strategy: matrix: target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + environment: + name: prod-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -912,7 +948,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: @@ -920,6 +956,8 @@ jobs: strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + environment: + name: prod-old env: KUBECONFIG: .kubeconfig steps: @@ -945,8 +983,8 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-storage-broker: name: deploy storage broker on old staging and old prod @@ -955,7 +993,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: @@ -963,6 +1001,8 @@ jobs: strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + environment: + name: prod-old env: KUBECONFIG: .kubeconfig steps: @@ -1011,6 +1051,8 @@ jobs: target_cluster: dev-eu-west-1-zeta deploy_link_proxy: false deploy_legacy_scram_proxy: false + environment: + name: dev-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -1026,19 +1068,19 @@ jobs: - name: Re-deploy scram proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy link proxy if: matrix.deploy_link_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set 
settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy legacy scram proxy if: matrix.deploy_legacy_scram_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-storage-broker-dev-new: runs-on: [ self-hosted, dev, x64 ] @@ -1058,6 +1100,8 @@ jobs: target_cluster: dev-us-east-2-beta - target_region: eu-west-1 target_cluster: dev-eu-west-1-zeta + environment: + name: dev-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -1096,6 +1140,8 @@ jobs: target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 @@ -1111,7 +1157,7 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set 
settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-storage-broker-prod-new: runs-on: prod @@ -1135,6 +1181,8 @@ jobs: target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} steps: - name: Checkout uses: actions/checkout@v3 diff --git a/Cargo.lock b/Cargo.lock index 913b39da0f..1649e28faa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,12 +66,6 @@ dependencies = [ "backtrace", ] -[[package]] -name = "arrayvec" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" - [[package]] name = "asn1-rs" version = "0.5.1" @@ -563,6 +557,12 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +[[package]] +name = "base64" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" + [[package]] name = "bincode" version = "1.3.3" @@ -627,12 +627,6 @@ version = "3.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" -[[package]] -name = "bytemuck" -version = "1.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f" - [[package]] name = "byteorder" version = "1.4.3" @@ -893,7 +887,7 @@ dependencies = [ "clap 4.0.29", "comfy-table", "git-version", - "nix 0.25.1", + "nix", "once_cell", "pageserver_api", "postgres", @@ -928,15 +922,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" -[[package]] -name = "cpp_demangle" 
-version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" -dependencies = [ - "cfg-if", -] - [[package]] name = "cpufeatures" version = "0.2.5" @@ -1060,7 +1045,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot 0.12.1", + "parking_lot", "signal-hook", "signal-hook-mio", "winapi", @@ -1170,15 +1155,6 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb" -[[package]] -name = "debugid" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730" -dependencies = [ - "uuid 0.8.2", -] - [[package]] name = "debugid" version = "0.8.0" @@ -1186,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" dependencies = [ "serde", - "uuid 1.2.2", + "uuid", ] [[package]] @@ -1312,18 +1288,6 @@ dependencies = [ "windows-sys 0.42.0", ] -[[package]] -name = "findshlibs" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" -dependencies = [ - "cc", - "lazy_static", - "libc", - "winapi", -] - [[package]] name = "fixedbitset" version = "0.4.2" @@ -1336,21 +1300,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.1.0" @@ -1752,16 +1701,16 @@ dependencies = [ ] [[package]] -name = "hyper-tls" -version = "0.5.0" +name = "hyper-tungstenite" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +checksum = "d62004bcd4f6f85d9e2aa4206f1466ee67031f5ededcb6c6e62d48f9306ad879" dependencies = [ - "bytes", "hyper", - "native-tls", + "pin-project", "tokio", - "tokio-native-tls", + "tokio-tungstenite", + "tungstenite", ] [[package]] @@ -1815,24 +1764,6 @@ dependencies = [ "serde", ] -[[package]] -name = "inferno" -version = "0.10.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f" -dependencies = [ - "ahash", - "atty", - "indexmap", - "itoa", - "lazy_static", - "log", - "num-format", - "quick-xml", - "rgb", - "str_stack", -] - [[package]] name = "inotify" version = "0.9.6" @@ -1920,7 +1851,7 @@ version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" dependencies = [ - "base64", + "base64 0.13.1", "pem", "ring", "serde", @@ -2059,15 +1990,6 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = "memmap2" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc" -dependencies = [ - "libc", -] - [[package]] name = "memoffset" version = "0.6.5" @@ -2135,37 +2057,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - -[[package]] -name = "nix" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" -dependencies = [ - "bitflags", - "cc", - "cfg-if", - "libc", - "memoffset 0.6.5", -] - [[package]] name = "nix" version = "0.25.1" @@ -2229,16 +2120,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-format" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" -dependencies = [ - "arrayvec", - "itoa", -] - [[package]] name = "num-integer" version = "0.1.45" @@ -2299,51 +2180,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "openssl" -version = "0.10.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29d971fd5722fec23977260f6e81aa67d2f22cadbdc2aa049f1022d9a3be1566" -dependencies = [ - "bitflags", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "openssl-probe" version = "0.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" -[[package]] -name = "openssl-sys" -version = "0.9.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5454462c0eced1e97f2ec09036abc8da362e66802f66fd20f86854d9d8cbcbc4" -dependencies = [ - "autocfg", - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "os_info" version = "3.5.1" @@ -2394,7 +2236,7 @@ dependencies = [ "hyper", "itertools", "metrics", - "nix 0.25.1", + "nix", "num-traits", "once_cell", "pageserver_api", @@ -2404,11 +2246,11 @@ dependencies = [ "postgres-types", "postgres_connection", "postgres_ffi", - "pprof", "pq_proto", "rand", "regex", "remote_storage", + "reqwest", "rstar", "scopeguard", "serde", @@ -2417,12 +2259,12 @@ dependencies = [ "signal-hook", "storage_broker", "svg_fmt", - "tar", "tempfile", "tenant_size_model", "thiserror", "tokio", "tokio-postgres", + "tokio-tar", "tokio-util", "toml_edit", "tracing", @@ -2447,17 +2289,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.5", -] - [[package]] name = "parking_lot" version = "0.12.1" @@ -2465,21 +2296,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.5", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall", - "smallvec", - "winapi", + "parking_lot_core", ] [[package]] @@ -2507,7 +2324,7 
@@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4" dependencies = [ - "base64", + "base64 0.13.1", ] [[package]] @@ -2528,18 +2345,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.10.1" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" dependencies = [ "phf_shared", ] [[package]] name = "phf_shared" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" dependencies = [ "siphasher", ] @@ -2576,12 +2393,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pkg-config" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" - [[package]] name = "plotters" version = "0.3.4" @@ -2612,12 +2423,12 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.2" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +version = "0.19.4" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "bytes", "fallible-iterator", - "futures", + "futures-util", "log", "tokio", "tokio-postgres", @@ -2626,9 +2437,9 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = 
"git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ - "base64", + "base64 0.20.0", "byteorder", "bytes", "fallible-iterator", @@ -2643,8 +2454,8 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.3" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +version = "0.2.4" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "bytes", "fallible-iterator", @@ -2688,25 +2499,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "pprof" -version = "0.6.1" -source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9" -dependencies = [ - "backtrace", - "cfg-if", - "findshlibs", - "inferno", - "lazy_static", - "libc", - "log", - "nix 0.23.2", - "parking_lot 0.11.2", - "symbolic-demangle", - "tempfile", - "thiserror", -] - [[package]] name = "ppv-lite86" version = "0.2.17" @@ -2723,6 +2515,7 @@ dependencies = [ "postgres-protocol", "rand", "serde", + "thiserror", "tokio", "tracing", "workspace_hack", @@ -2801,7 +2594,7 @@ dependencies = [ "lazy_static", "libc", "memchr", - "parking_lot 0.12.1", + "parking_lot", "procfs", "thiserror", ] @@ -2868,7 +2661,7 @@ dependencies = [ "anyhow", "async-trait", "atty", - "base64", + "base64 0.13.1", "bstr", "bytes", "clap 4.0.29", @@ -2878,15 +2671,17 @@ dependencies = [ "hex", "hmac", "hyper", + "hyper-tungstenite", "itertools", "md5", "metrics", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "pin-project-lite", "pq_proto", "rand", "rcgen", + "regex", "reqwest", 
"routerify", "rstest", @@ -2898,6 +2693,7 @@ dependencies = [ "sha2", "socket2", "thiserror", + "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -2906,20 +2702,12 @@ dependencies = [ "tracing-subscriber", "url", "utils", - "uuid 1.2.2", + "uuid", + "webpki-roots", "workspace_hack", "x509-parser", ] -[[package]] -name = "quick-xml" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" -dependencies = [ - "memchr", -] - [[package]] name = "quote" version = "1.0.21" @@ -3078,7 +2866,7 @@ version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c" dependencies = [ - "base64", + "base64 0.13.1", "bytes", "encoding_rs", "futures-core", @@ -3088,12 +2876,10 @@ dependencies = [ "http-body", "hyper", "hyper-rustls", - "hyper-tls", "ipnet", "js-sys", "log", "mime", - "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -3103,7 +2889,6 @@ dependencies = [ "serde_json", "serde_urlencoded", "tokio", - "tokio-native-tls", "tokio-rustls", "tower-service", "url", @@ -3114,15 +2899,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "rgb" -version = "0.8.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3" -dependencies = [ - "bytemuck", -] - [[package]] name = "ring" version = "0.16.20" @@ -3261,7 +3037,7 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" dependencies = [ - "base64", + "base64 0.13.1", ] [[package]] @@ -3303,9 +3079,9 @@ dependencies = [ "humantime", "hyper", "metrics", - "nix 0.25.1", + "nix", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "postgres", "postgres-protocol", "postgres_ffi", @@ -3417,14 
+3193,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc" dependencies = [ "httpdate", - "native-tls", "reqwest", + "rustls", "sentry-backtrace", "sentry-contexts", "sentry-core", "sentry-panic", "tokio", "ureq", + "webpki-roots", ] [[package]] @@ -3482,7 +3259,7 @@ version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6" dependencies = [ - "debugid 0.8.0", + "debugid", "getrandom", "hex", "serde", @@ -3490,7 +3267,7 @@ dependencies = [ "thiserror", "time", "url", - "uuid 1.2.2", + "uuid", ] [[package]] @@ -3542,7 +3319,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" dependencies = [ - "base64", + "base64 0.13.1", "chrono", "hex", "indexmap", @@ -3564,6 +3341,17 @@ dependencies = [ "syn", ] +[[package]] +name = "sha-1" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sha1" version = "0.10.5" @@ -3712,7 +3500,7 @@ dependencies = [ "hyper", "metrics", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "prost", "tokio", "tokio-stream", @@ -3723,12 +3511,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "str_stack" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" - [[package]] name = "stringprep" version = "0.1.2" @@ -3776,29 +3558,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" -[[package]] -name = 
"symbolic-common" -version = "8.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" -dependencies = [ - "debugid 0.7.3", - "memmap2", - "stable_deref_trait", - "uuid 0.8.2", -] - -[[package]] -name = "symbolic-demangle" -version = "8.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750" -dependencies = [ - "cpp_demangle", - "rustc-demangle", - "symbolic-common", -] - [[package]] name = "syn" version = "1.0.105" @@ -3957,10 +3716,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] -name = "tokio" -version = "1.21.1" +name = "tls-listener" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95" +checksum = "c9d4ff21187d434ac7709bfc7441ca88f63681247e5ad99f0f08c8c91ddc103d" +dependencies = [ + "futures-util", + "hyper", + "pin-project-lite", + "thiserror", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "tokio" +version = "1.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" dependencies = [ "autocfg", "bytes", @@ -3968,12 +3741,11 @@ dependencies = [ "memchr", "mio", "num_cpus", - "once_cell", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "winapi", + "windows-sys 0.42.0", ] [[package]] @@ -3997,28 +3769,19 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-postgres" -version = 
"0.7.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +version = "0.7.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "async-trait", "byteorder", "bytes", "fallible-iterator", - "futures", + "futures-channel", + "futures-util", "log", - "parking_lot 0.12.1", + "parking_lot", "percent-encoding", "phf", "pin-project-lite", @@ -4065,6 +3828,32 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-tar" +version = "0.3.0" +source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142" +dependencies = [ + "filetime", + "futures-core", + "libc", + "redox_syscall", + "tokio", + "tokio-stream", + "xattr", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f714dd15bead90401d77e04243611caec13726c2408afd5b31901dfcdcb3b181" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite", +] + [[package]] name = "tokio-util" version = "0.7.4" @@ -4109,7 +3898,7 @@ dependencies = [ "async-stream", "async-trait", "axum", - "base64", + "base64 0.13.1", "bytes", "futures-core", "futures-util", @@ -4291,6 +4080,25 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" +[[package]] +name = "tungstenite" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e27992fd6a8c29ee7eef28fc78349aa244134e10ad447ce3b9f0ac0ed0fa4ce0" +dependencies = [ + "base64 0.13.1", + "byteorder", + "bytes", + "http", + "httparse", + "log", + "rand", + "sha-1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "typenum" version = 
"1.16.0" @@ -4351,12 +4159,14 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f" dependencies = [ - "base64", + "base64 0.13.1", "chunked_transfer", "log", - "native-tls", "once_cell", + "rustls", "url", + "webpki", + "webpki-roots", ] [[package]] @@ -4377,6 +4187,12 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utils" version = "0.1.0" @@ -4393,7 +4209,7 @@ dependencies = [ "hyper", "jsonwebtoken", "metrics", - "nix 0.25.1", + "nix", "once_cell", "pq_proto", "rand", @@ -4417,12 +4233,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "uuid" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" - [[package]] name = "uuid" version = "1.2.2" @@ -4439,12 +4249,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -4743,9 +4547,9 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ - "ahash", "anyhow", "bytes", + "chrono", "clap 4.0.29", "crossbeam-utils", "either", @@ -4766,11 +4570,10 @@ dependencies = [ "rand", "regex", "regex-syntax", - "reqwest", "scopeguard", "serde", + "serde_json", "socket2", - "stable_deref_trait", 
"syn", "tokio", "tokio-util", @@ -4787,7 +4590,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" dependencies = [ "asn1-rs", - "base64", + "base64 0.13.1", "data-encoding", "der-parser", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 2f73215d3f..927900d5c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,4 +86,4 @@ lto = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. [patch.crates-io] -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index ad036338a0..e7fba49bb1 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto # Remove headers that we won't need anymore - we've completed installation of all extensions RUN rm -r /usr/local/pgsql/include -# Remove now-useless PGXS src infrastructure -RUN rm -r /usr/local/pgsql/lib/pgxs/src - # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. 
@@ -207,7 +204,8 @@ RUN apt update && \ libgeos-c1v5 \ libgdal28 \ libproj19 \ - libprotobuf-c1 && \ + libprotobuf-c1 \ + gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* USER postgres diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index 4526644421..cd03525b97 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto # Remove headers that we won't need anymore - we've completed installation of all extensions RUN rm -r /usr/local/pgsql/include -# Remove now-useless PGXS src infrastructure -RUN rm -r /usr/local/pgsql/lib/pgxs/src - # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. @@ -207,7 +204,8 @@ RUN apt update && \ libgeos-c1v5 \ libgdal28 \ libproj19 \ - libprotobuf-c1 && \ + libprotobuf-c1 \ + gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* USER postgres diff --git a/Makefile b/Makefile index 4711dc1c7d..92a4532684 100644 --- a/Makefile +++ b/Makefile @@ -61,146 +61,115 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-v14-headers postgres-v15-headers +neon: postgres-headers +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -# The rules are duplicated for Postgres v14 and 15. We may want to refactor +# Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. 
# -$(POSTGRES_INSTALL_DIR)/build/v14/config.status: - +@echo "Configuring Postgres v14 build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14 - (cd $(POSTGRES_INSTALL_DIR)/build/v14 && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \ +$(POSTGRES_INSTALL_DIR)/build/%/config.status: + +@echo "Configuring Postgres $* build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* + (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log) - -$(POSTGRES_INSTALL_DIR)/build/v15/config.status: - +@echo "Configuring Postgres v15 build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15 - (cd $(POSTGRES_INSTALL_DIR)/build/v15 && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \ - CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) # nicer alias to run 'configure' -.PHONY: postgres-v14-configure -postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status - -.PHONY: postgres-v15-configure -postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +# Note: I've been unable to use templates for this part of our configuration. +# I'm not sure why it wouldn't work, but this is the only place (apart from +# the "build-all-versions" entry points) where direct mention of PostgreSQL +# versions is used. 
+.PHONY: postgres-configure-v15 +postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +.PHONY: postgres-configure-v14 +postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include -.PHONY: postgres-v14-headers -postgres-v14-headers: postgres-v14-configure - +@echo "Installing PostgreSQL v14 headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install - -.PHONY: postgres-v15-headers -postgres-v15-headers: postgres-v15-configure - +@echo "Installing PostgreSQL v15 headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install +.PHONY: postgres-headers-% +postgres-headers-%: postgres-configure-% + +@echo "Installing PostgreSQL $* headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install # Compile and install PostgreSQL -.PHONY: postgres-v14 -postgres-v14: postgres-v14-configure \ - postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers` - +@echo "Compiling PostgreSQL v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install - +@echo "Compiling libpq v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install - +@echo "Compiling pg_prewarm v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install - +@echo "Compiling pg_buffercache v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install - +@echo "Compiling pageinspect v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install +.PHONY: postgres-% +postgres-%: postgres-configure-% \ + postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install + +@echo "Compiling libpq $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install + +@echo "Compiling pg_prewarm 
$*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install + +@echo "Compiling pg_buffercache $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + +@echo "Compiling pageinspect $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install -.PHONY: postgres-v15 -postgres-v15: postgres-v15-configure \ - postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers` - +@echo "Compiling PostgreSQL v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install - +@echo "Compiling libpq v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install - +@echo "Compiling pg_prewarm v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install - +@echo "Compiling pg_buffercache v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install - +@echo "Compiling pageinspect v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install +.PHONY: postgres-clean-% +postgres-clean-%: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean -# shorthand to build all Postgres versions -postgres: postgres-v14 postgres-v15 +.PHONY: neon-pg-ext-% +neon-pg-ext-%: postgres-% + +@echo "Compiling neon $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install + +@echo "Compiling neon_walredo $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -f 
$(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install + +@echo "Compiling neon_test_utils $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install -.PHONY: postgres-v14-clean -postgres-v14-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean +.PHONY: neon-pg-ext-clean-% +neon-pg-ext-clean-%: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean -.PHONY: postgres-v15-clean -postgres-v15-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean - -neon-pg-ext-v14: postgres-v14 - +@echo "Compiling neon v14" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) - +@echo "Compiling neon_walredo v14" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f 
$(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) - +@echo "Compiling neon_test_utils" v14 - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) - -neon-pg-ext-v15: postgres-v15 - +@echo "Compiling neon v15" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) - +@echo "Compiling neon_walredo v15" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) - +@echo "Compiling neon_test_utils" v15 - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) +.PHONY: neon-pg-ext +neon-pg-ext: \ + neon-pg-ext-v14 \ + neon-pg-ext-v15 .PHONY: neon-pg-ext-clean - $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean - $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean +neon-pg-ext-clean: \ + neon-pg-ext-clean-v14 \ + neon-pg-ext-clean-v15 -neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15 -postgres-headers: postgres-v14-headers postgres-v15-headers -postgres-clean: postgres-v14-clean postgres-v15-clean +# shorthand to build all Postgres versions +.PHONY: postgres +postgres: \ + postgres-v14 \ + postgres-v15 + +.PHONY: postgres-headers +postgres-headers: \ + postgres-headers-v14 \ + postgres-headers-v15 + +.PHONY: postgres-clean 
+postgres-clean: \ + postgres-clean-v14 \ + postgres-clean-v15 # This doesn't remove the effects of 'configure'. .PHONY: clean -clean: - cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean - cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean +clean: postgres-clean neon-pg-ext-clean $(CARGO_CMD_PREFIX) cargo clean - cd pgxn/neon && $(MAKE) clean - cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean diff --git a/README.md b/README.md index 30bde949a9..7b629e71a5 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ - libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler + libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ + protobuf-devel ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -117,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Later that would be responsibility of a package install script > ./target/debug/neon_local init Starting pageserver at '127.0.0.1:64000' in '.neon'. -pageserver started, pid: 2545906 -Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9 -Stopped pageserver 1 process with pid 2545906 -# start pageserver and safekeeper +# start pageserver, safekeeper, and broker for their intercommunication > ./target/debug/neon_local start Starting neon broker at 127.0.0.1:50051 storage_broker started, pid: 2918372 @@ -130,6 +128,12 @@ pageserver started, pid: 2918386 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. 
safekeeper 1 started, pid: 2918437 +# create initial tenant and use it as a default for every future neon_local invocation +> ./target/debug/neon_local tenant create --set-default +tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver +Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c +Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one + # start postgres compute node > ./target/debug/neon_local pg start main Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ... diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index a35cef197d..4c65649610 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -2,6 +2,7 @@ name = "compute_tools" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" @@ -12,12 +13,12 @@ futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } notify = "5.0.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7786d7af9c..f3b787209d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ 
b/compute_tools/src/bin/compute_ctl.rs @@ -105,7 +105,7 @@ fn main() -> Result<()> { tenant, timeline, pageserver_connstr, - metrics: ComputeMetrics::new(), + metrics: ComputeMetrics::default(), state: RwLock::new(ComputeState::new()), }; let compute = Arc::new(compute_state); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index b6ba1692f9..ee1605c814 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -5,7 +5,7 @@ use tokio_postgres::NoTls; use crate::compute::ComputeNode; -pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { +pub fn create_writability_check_data(client: &mut Client) -> Result<()> { let query = " CREATE TABLE IF NOT EXISTS health_check ( id serial primary key, diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index bfdd2340ec..c2c9ab2230 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -23,11 +23,11 @@ use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::info; +use log::{info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; -use crate::checker::create_writablity_check_data; +use crate::checker::create_writability_check_data; use crate::config; use crate::pg_helpers::*; use crate::spec::*; @@ -91,7 +91,7 @@ pub enum ComputeStatus { Failed, } -#[derive(Serialize)] +#[derive(Default, Serialize)] pub struct ComputeMetrics { pub sync_safekeepers_ms: AtomicU64, pub basebackup_ms: AtomicU64, @@ -99,23 +99,6 @@ pub struct ComputeMetrics { pub total_startup_ms: AtomicU64, } -impl ComputeMetrics { - pub fn new() -> Self { - Self { - sync_safekeepers_ms: AtomicU64::new(0), - basebackup_ms: AtomicU64::new(0), - config_ms: AtomicU64::new(0), - total_startup_ms: AtomicU64::new(0), - } - } -} - -impl Default for ComputeMetrics { - fn default() -> Self { - Self::new() - } -} - impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { 
self.state.write().unwrap().status = status; @@ -175,7 +158,7 @@ impl ComputeNode { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) - .args(&["--sync-safekeepers"]) + .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .stdout(Stdio::piped()) .spawn() @@ -253,7 +236,7 @@ impl ComputeNode { // Run postgres as a child process. let mut pg = Command::new(&self.pgbin) - .args(&["-D", &self.pgdata]) + .args(["-D", &self.pgdata]) .spawn() .expect("cannot start postgres process"); @@ -292,7 +275,7 @@ impl ComputeNode { handle_databases(&self.spec, &mut client)?; handle_role_deletions(self, &mut client)?; handle_grants(self, &mut client)?; - create_writablity_check_data(&mut client)?; + create_writability_check_data(&mut client)?; // 'Close' connection drop(client); @@ -328,6 +311,9 @@ impl ComputeNode { .wait() .expect("failed to start waiting on Postgres process"); + self.check_for_core_dumps() + .expect("failed to check for core dumps"); + Ok(ecode) } @@ -343,4 +329,68 @@ impl ComputeNode { self.prepare_pgdata()?; self.run() } + + // Look for core dumps and collect backtraces. + // + // EKS worker nodes have following core dump settings: + // /proc/sys/kernel/core_pattern -> core + // /proc/sys/kernel/core_uses_pid -> 1 + // ulimint -c -> unlimited + // which results in core dumps being written to postgres data directory as core.. + // + // Use that as a default location and pattern, except macos where core dumps are written + // to /cores/ directory by default. 
+ fn check_for_core_dumps(&self) -> Result<()> { + let core_dump_dir = match std::env::consts::OS { + "macos" => Path::new("/cores/"), + _ => Path::new(&self.pgdata), + }; + + // Collect core dump paths if any + info!("checking for core dumps in {}", core_dump_dir.display()); + let files = fs::read_dir(core_dump_dir)?; + let cores = files.filter_map(|entry| { + let entry = entry.ok()?; + let _ = entry.file_name().to_str()?.strip_prefix("core.")?; + Some(entry.path()) + }); + + // Print backtrace for each core dump + for core_path in cores { + warn!( + "core dump found: {}, collecting backtrace", + core_path.display() + ); + + // Try first with gdb + let backtrace = Command::new("gdb") + .args(["--batch", "-q", "-ex", "bt", &self.pgbin]) + .arg(&core_path) + .output(); + + // Try lldb if no gdb is found -- that is handy for local testing on macOS + let backtrace = match backtrace { + Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => { + warn!("cannot find gdb, trying lldb"); + Command::new("lldb") + .arg("-c") + .arg(&core_path) + .args(["--batch", "-o", "bt all", "-o", "quit"]) + .output() + } + _ => backtrace, + }?; + + warn!( + "core dump backtrace: {}", + String::from_utf8_lossy(&backtrace.stdout) + ); + warn!( + "debugger stderr: {}", + String::from_utf8_lossy(&backtrace.stderr) + ); + } + + Ok(()) + } } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 4c8bbc608b..44f83e5003 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,29 +9,11 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode}; use log::{error, info}; use serde_json; -use crate::compute::{ComputeNode, ComputeStatus}; +use crate::compute::ComputeNode; // Service function to handle all available routes. async fn routes(req: Request, compute: Arc) -> Response { match (req.method(), req.uri().path()) { - // Timestamp of the last Postgres activity in the plain text. 
- // DEPRECATED in favour of /status - (&Method::GET, "/last_activity") => { - info!("serving /last_active GET request"); - let state = compute.state.read().unwrap(); - - // Use RFC3339 format for consistency. - Response::new(Body::from(state.last_active.to_rfc3339())) - } - - // Has compute setup process finished? -> true/false. - // DEPRECATED in favour of /status - (&Method::GET, "/ready") => { - info!("serving /ready GET request"); - let status = compute.get_status(); - Response::new(Body::from(format!("{}", status == ComputeStatus::Running))) - } - // Serialized compute state. (&Method::GET, "/status") => { info!("serving /status GET request"); @@ -46,16 +28,6 @@ async fn routes(req: Request, compute: Arc) -> Response Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) } - // DEPRECATED, use POST instead - (&Method::GET, "/check_writability") => { - info!("serving /check_writability GET request"); - let res = crate::checker::check_writability(&compute).await; - match res { - Ok(_) => Response::new(Body::from("true")), - Err(e) => Response::new(Body::from(e.to_string())), - } - } - (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); let res = crate::checker::check_writability(&compute).await; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 9c0f8e3ccd..a857531d26 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,58 +37,7 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" - /ready: - get: - deprecated: true - tags: - - "info" - summary: Check whether compute startup process finished successfully - description: "" - operationId: computeIsReady - responses: - "200": - description: Compute is ready ('true') or not ('false') - content: - text/plain: - schema: - type: string - example: "true" - - /last_activity: - get: - deprecated: true - tags: - - "info" - summary: Get timestamp 
of the last compute activity - description: "" - operationId: getLastComputeActivityTS - responses: - "200": - description: Timestamp of the last compute activity - content: - text/plain: - schema: - type: string - example: "2022-10-12T07:20:50.52Z" - /check_writability: - get: - deprecated: true - tags: - - "check" - summary: Check that we can write new data on this compute - description: "" - operationId: checkComputeWritabilityDeprecated - responses: - "200": - description: Check result - content: - text/plain: - schema: - type: string - description: Error text or 'true' if check passed - example: "true" - post: tags: - "check" diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 58cdf796bc..c871422e78 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) { let mut idle_backs: Vec> = vec![]; for b in backs.into_iter() { - let state: String = b.get("state"); - let change: String = b.get("state_change"); + let state: String = match b.try_get("state") { + Ok(state) => state, + Err(_) => continue, + }; if state == "idle" { + let change: String = match b.try_get("state_change") { + Ok(state_change) => state_change, + Err(_) => continue, + }; let change = DateTime::parse_from_rfc3339(&change); match change { Ok(t) => idle_backs.push(t.with_timezone(&Utc)), @@ -74,10 +80,8 @@ fn watch_compute_activity(compute: &ComputeNode) { } } - // Sort idle backend `state_change` timestamps. The last one corresponds - // to the last activity. - idle_backs.sort(); - if let Some(last) = idle_backs.last() { + // Get idle backend `state_change` with the max timestamp. 
+ if let Some(last) = idle_backs.iter().max() { last_active = *last; } } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 289f223bda..ff422f1cf5 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -119,16 +119,9 @@ pub trait GenericOptionsSearch { impl GenericOptionsSearch for GenericOptions { /// Lookup option by name fn find(&self, name: &str) -> Option { - match &self { - Some(ops) => { - let op = ops.iter().find(|s| s.name == name); - match op { - Some(op) => op.value.clone(), - None => None, - } - } - None => None, - } + let ops = self.as_ref()?; + let op = ops.iter().find(|s| s.name == name)?; + op.value.clone() } } @@ -161,6 +154,14 @@ impl Role { } impl Database { + pub fn new(name: PgIdent, owner: PgIdent) -> Self { + Self { + name, + owner, + options: None, + } + } + /// Serialize a list of database parameters into a Postgres-acceptable /// string of arguments. /// NB: `TEMPLATE` is actually also an identifier, but so far we only need @@ -219,11 +220,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { &[], )? .iter() - .map(|row| Database { - name: row.get("datname"), - owner: row.get("owner"), - options: None, - }) + .map(|row| Database::new(row.get("datname"), row.get("owner"))) .collect(); Ok(postgres_dbs) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 58c94d74ae..81e01fe555 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,5 +1,6 @@ use std::path::Path; use std::str::FromStr; +use std::time::Instant; use anyhow::Result; use log::{info, log_enabled, warn, Level}; @@ -197,22 +198,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Reassign all dependent objects and delete requested roles. pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { - let spec = &node.spec; - - // First, reassign all dependent objects to db owners. 
- if let Some(ops) = &spec.delta_operations { + if let Some(ops) = &node.spec.delta_operations { + // First, reassign all dependent objects to db owners. info!("reassigning dependent objects of to-be-deleted roles"); for op in ops { if op.action == "delete_role" { reassign_owned_objects(node, &op.name)?; } } - } - // Second, proceed with role deletions. - let mut xact = client.transaction()?; - if let Some(ops) = &spec.delta_operations { + // Second, proceed with role deletions. info!("processing role deletions"); + let mut xact = client.transaction()?; for op in ops { // We do not check either role exists or not, // Postgres will take care of it for us @@ -223,6 +220,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< xact.execute(query.as_str(), &[])?; } } + xact.commit()?; } Ok(()) @@ -317,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // XXX: with a limited number of databases it is fine, but consider making it a HashMap let pg_db = existing_dbs.iter().find(|r| r.name == *name); + let start_time = Instant::now(); if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. 
@@ -335,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info_print!(" -> update"); client.execute(query.as_str(), &[])?; + let elapsed = start_time.elapsed().as_millis(); + info_print!(" ({} ms)", elapsed); } } else { let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); @@ -342,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { query.push_str(&db.to_pg_options()); client.execute(query.as_str(), &[])?; + + let elapsed = start_time.elapsed().as_millis(); + info_print!(" ({} ms)", elapsed); } info_print!("\n"); diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 24cad4663a..431d9794bc 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -38,4 +38,33 @@ mod pg_helpers_tests { assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } + + #[test] + fn generic_options_search() { + let generic_options: GenericOptions = Some(vec![ + GenericOption { + name: "present_value".into(), + value: Some("value".into()), + vartype: "string".into(), + }, + GenericOption { + name: "missed_value".into(), + value: None, + vartype: "int".into(), + }, + ]); + assert_eq!(generic_options.find("present_value"), Some("value".into())); + assert_eq!(generic_options.find("missed_value"), None); + assert_eq!(generic_options.find("invalid_value"), None); + + let empty_generic_options: GenericOptions = Some(vec![]); + assert_eq!(empty_generic_options.find("present_value"), None); + assert_eq!(empty_generic_options.find("missed_value"), None); + assert_eq!(empty_generic_options.find("invalid_value"), None); + + let none_generic_options: GenericOptions = None; + assert_eq!(none_generic_options.find("present_value"), None); + assert_eq!(none_generic_options.find("missed_value"), None); + assert_eq!(none_generic_options.find("invalid_value"), None); + } } diff --git a/control_plane/Cargo.toml 
b/control_plane/Cargo.toml index 00b34aafb1..1c6cd6d882 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -2,6 +2,7 @@ name = "control_plane" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" @@ -10,7 +11,7 @@ comfy-table = "6.1" git-version = "0.3.5" nix = "0.25" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 8909e27c94..1f3f8f45ea 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -136,22 +136,6 @@ where anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds"); } -/// Send SIGTERM to child process -pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> { - let pid = child.id(); - match kill( - nix::unistd::Pid::from_raw(pid.try_into().unwrap()), - Signal::SIGTERM, - ) { - Ok(()) => Ok(()), - Err(Errno::ESRCH) => { - println!("child process with pid {pid} does not exist"); - Ok(()) - } - Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"), - } -} - /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. 
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> { let pid = match pid_file::read(pid_file) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f0c3b983f0..4b2aa3c957 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } else if let Some(default_id) = env.default_tenant_id { Ok(default_id) } else { - bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file"); + anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant"); } } @@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let initial_timeline_id_arg = parse_timeline_id(init_match)?; - // Create config file let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { // load and parse the file @@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; env.init(pg_version) .context("Failed to initialize neon repository")?; - let initial_tenant_id = env - .default_tenant_id - .expect("default_tenant_id should be generated by the `env.init()` call above"); // Initialize pageserver, create initial tenant and timeline. 
let pageserver = PageServerNode::from_env(&env); - let initial_timeline_id = pageserver - .initialize( - Some(initial_tenant_id), - initial_timeline_id_arg, - &pageserver_config_overrides(init_match), - pg_version, - ) + pageserver + .initialize(&pageserver_config_overrides(init_match)) .unwrap_or_else(|e| { eprintln!("pageserver init failed: {e:?}"); exit(1); }); - env.register_branch_mapping( - DEFAULT_BRANCH_NAME.to_owned(), - initial_tenant_id, - initial_timeline_id, - )?; - Ok(env) } @@ -388,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an println!( "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}", ); + + if create_match.get_flag("set-default") { + println!("Setting tenant {new_tenant_id} as a default one"); + env.default_tenant_id = Some(new_tenant_id); + } + } + Some(("set-default", set_default_match)) => { + let tenant_id = + parse_tenant_id(set_default_match)?.context("No tenant id specified")?; + println!("Setting tenant {tenant_id} as a default one"); + env.default_tenant_id = Some(tenant_id); } Some(("config", create_match)) => { let tenant_id = get_tenant_id(create_match, env)?; @@ -549,7 +544,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { table.load_preset(comfy_table::presets::NOTHING); - table.set_header(&[ + table.set_header([ "NODE", "ADDRESS", "TIMELINE", @@ -584,7 +579,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .map(|name| name.as_str()) .unwrap_or("?"); - table.add_row(&[ + table.add_row([ node_name.as_str(), &node.address.to_string(), &node.timeline_id.to_string(), @@ -747,7 +742,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result Resul } fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { - broker::start_broker_process(env)?; - let pageserver = PageServerNode::from_env(env); - // Postgres nodes are not 
started automatically + broker::start_broker_process(env)?; + + let pageserver = PageServerNode::from_env(env); if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { - eprintln!("pageserver start failed: {e}"); - try_stop_storage_broker_process(env); + eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e); + try_stop_all(env, true); exit(1); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); - try_stop_storage_broker_process(env); + eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); + try_stop_all(env, false); exit(1); } } @@ -832,35 +827,41 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + try_stop_all(env, immediate); + + Ok(()) +} + +fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let pageserver = PageServerNode::from_env(env); // Stop all compute nodes - let cplane = ComputeControlPlane::load(env.clone())?; - for (_k, node) in cplane.nodes { - if let Err(e) = node.stop(false) { - eprintln!("postgres stop failed: {}", e); + match ComputeControlPlane::load(env.clone()) { + Ok(cplane) => { + for (_k, node) in cplane.nodes { + if let Err(e) = node.stop(false) { + eprintln!("postgres stop failed: {e:#}"); + } + } + } + Err(e) => { + eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}") } } if let Err(e) = pageserver.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); + eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); + eprintln!("safekeeper {} stop failed: {:#}", 
safekeeper.id, e); } } - try_stop_storage_broker_process(env); - - Ok(()) -} - -fn try_stop_storage_broker_process(env: &local_env::LocalEnv) { if let Err(e) = broker::stop_broker_process(env) { - eprintln!("neon broker stop failed: {e}"); + eprintln!("neon broker stop failed: {e:#}"); } } @@ -900,6 +901,7 @@ fn cli() -> Command { let stop_mode_arg = Arg::new("stop-mode") .short('m') .value_parser(["fast", "immediate"]) + .default_value("fast") .help("If 'immediate', don't flush repository data at shutdown") .required(false) .value_name("stop-mode"); @@ -921,9 +923,8 @@ fn cli() -> Command { .version(GIT_VERSION) .subcommand( Command::new("init") - .about("Initialize a new Neon repository") + .about("Initialize a new Neon repository, preparing configs for services to start with") .arg(pageserver_config_args.clone()) - .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg( Arg::new("config") .long("config") @@ -985,11 +986,14 @@ fn cli() -> Command { .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) .arg(pg_version_arg.clone()) + .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false) + .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) ) + .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) + .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) - .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) - ) + .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/broker.rs 
b/control_plane/src/broker.rs index bd60580012..6c0604a076 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -17,7 +17,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { "storage_broker", &env.base_data_dir, &env.storage_broker_bin(), - &args, + args, [], background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)), || { diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 0eec25c51e..8731cf2583 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -44,7 +44,7 @@ impl ComputeControlPlane { let mut nodes = BTreeMap::default(); let pgdatadirspath = &env.pg_data_dirs_path(); - for tenant_dir in fs::read_dir(&pgdatadirspath) + for tenant_dir in fs::read_dir(pgdatadirspath) .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? { let tenant_dir = tenant_dir?; @@ -67,8 +67,8 @@ impl ComputeControlPlane { fn get_port(&mut self) -> u16 { 1 + self .nodes - .iter() - .map(|(_name, node)| node.address.port()) + .values() + .map(|node| node.address.port()) .max() .unwrap_or(self.base_port) } @@ -183,7 +183,7 @@ impl PostgresNode { fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); - let mut cmd = Command::new(&pg_path); + let mut cmd = Command::new(pg_path); cmd.arg("--sync-safekeepers") .env_clear() @@ -201,7 +201,7 @@ impl PostgresNode { .stderr(Stdio::piped()); if let Some(token) = auth_token { - cmd.env("ZENITH_AUTH_TOKEN", token); + cmd.env("NEON_AUTH_TOKEN", token); } let sync_handle = cmd @@ -261,7 +261,7 @@ impl PostgresNode { } fn create_pgdata(&self) -> Result<()> { - fs::create_dir_all(&self.pgdata()).with_context(|| { + fs::create_dir_all(self.pgdata()).with_context(|| { format!( "could not create data directory {}", self.pgdata().display() @@ -304,17 +304,17 @@ impl PostgresNode { // Set up authentication // - // 
$ZENITH_AUTH_TOKEN will be replaced with value from environment + // $NEON_AUTH_TOKEN will be replaced with value from environment // variable during compute pg startup. It is done this way because // otherwise user will be able to retrieve the value using SHOW // command or pg_settings let password = if let AuthType::NeonJWT = auth_type { - "$ZENITH_AUTH_TOKEN" + "$NEON_AUTH_TOKEN" } else { "" }; // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere. - // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN + // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN // We parse this string and build it back with token from env var, and for simplicity rebuild // uses only needed variables namely host, port, user, password. format!("postgresql://no_user:{password}@{host}:{port}") @@ -323,7 +323,7 @@ impl PostgresNode { conf.append_line(""); conf.append("neon.pageserver_connstring", &pageserver_connstr); if let AuthType::NeonJWT = auth_type { - conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN"); + conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN"); } conf.append("neon.tenant_id", &self.tenant_id.to_string()); conf.append("neon.timeline_id", &self.timeline_id.to_string()); @@ -448,7 +448,7 @@ impl PostgresNode { self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ); if let Some(token) = auth_token { - cmd.env("ZENITH_AUTH_TOKEN", token); + cmd.env("NEON_AUTH_TOKEN", token); } let pg_ctl = cmd.output().context("pg_ctl failed")?; @@ -478,7 +478,7 @@ impl PostgresNode { postgresql_conf_path.to_str().unwrap() ) })?; - fs::remove_dir_all(&self.pgdata())?; + fs::remove_dir_all(self.pgdata())?; self.create_pgdata()?; // 2. 
Bring back config files @@ -514,7 +514,7 @@ impl PostgresNode { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(&self.pgdata())?; + fs::remove_dir_all(self.pgdata())?; } else { self.pg_ctl(&["stop"], &None)?; } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ed9e467eee..003152c578 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -296,11 +296,6 @@ impl LocalEnv { env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } - // If no initial tenant ID was given, generate it. - if env.default_tenant_id.is_none() { - env.default_tenant_id = Some(TenantId::generate()); - } - env.base_data_dir = base_path(); Ok(env) @@ -404,7 +399,7 @@ impl LocalEnv { } } - fs::create_dir(&base_path)?; + fs::create_dir(base_path)?; // generate keys for jwt // openssl genrsa -out private_key.pem 2048 @@ -413,7 +408,7 @@ impl LocalEnv { private_key_path = base_path.join("auth_private_key.pem"); let keygen_output = Command::new("openssl") .arg("genrsa") - .args(&["-out", private_key_path.to_str().unwrap()]) + .args(["-out", private_key_path.to_str().unwrap()]) .arg("2048") .stdout(Stdio::null()) .output() @@ -430,10 +425,10 @@ impl LocalEnv { // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem let keygen_output = Command::new("openssl") .arg("rsa") - .args(&["-in", private_key_path.to_str().unwrap()]) + .args(["-in", private_key_path.to_str().unwrap()]) .arg("-pubout") - .args(&["-outform", "PEM"]) - .args(&["-out", public_key_path.to_str().unwrap()]) + .args(["-outform", "PEM"]) + .args(["-out", public_key_path.to_str().unwrap()]) .stdout(Stdio::null()) .output() .context("failed to generate auth private key")?; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3575e75db9..9cebe028e4 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -7,7 +7,7 @@ use 
std::path::PathBuf; use std::process::{Child, Command}; use std::{io, result}; -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; @@ -130,83 +130,15 @@ impl PageServerNode { overrides } - /// Initializes a pageserver node by creating its config with the overrides provided, - /// and creating an initial tenant and timeline afterwards. - pub fn initialize( - &self, - create_tenant: Option, - initial_timeline_id: Option, - config_overrides: &[&str], - pg_version: u32, - ) -> anyhow::Result { + /// Initializes a pageserver node by creating its config with the overrides provided. + pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> { // First, run `pageserver --init` and wait for it to write a config into FS and exit. self.pageserver_init(config_overrides).with_context(|| { format!( "Failed to run init for pageserver node {}", self.env.pageserver.id, ) - })?; - - // Then, briefly start it fully to run HTTP commands on it, - // to create initial tenant and timeline. - // We disable the remote storage, since we stop pageserver right after the timeline creation, - // hence most of the uploads will either aborted or not started: no point to start them at all. - let disabled_remote_storage_override = "remote_storage={}"; - let mut pageserver_process = self - .start_node( - &[disabled_remote_storage_override], - // Previous overrides will be taken from the config created before, don't overwrite them. 
- false, - ) - .with_context(|| { - format!( - "Failed to start a process for pageserver node {}", - self.env.pageserver.id, - ) - })?; - - let init_result = self - .try_init_timeline(create_tenant, initial_timeline_id, pg_version) - .context("Failed to create initial tenant and timeline for pageserver"); - match &init_result { - Ok(initial_timeline_id) => { - println!("Successfully initialized timeline {initial_timeline_id}") - } - Err(e) => eprintln!("{e:#}"), - } - background_process::send_stop_child_process(&pageserver_process)?; - - let exit_code = pageserver_process.wait()?; - ensure!( - exit_code.success(), - format!( - "pageserver init failed with exit code {:?}", - exit_code.code() - ) - ); - println!( - "Stopped pageserver {} process with pid {}", - self.env.pageserver.id, - pageserver_process.id(), - ); - init_result - } - - fn try_init_timeline( - &self, - new_tenant_id: Option, - new_timeline_id: Option, - pg_version: u32, - ) -> anyhow::Result { - let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; - let initial_timeline_info = self.timeline_create( - initial_tenant_id, - new_timeline_id, - None, - None, - Some(pg_version), - )?; - Ok(initial_timeline_info.timeline_id) + }) } pub fn repo_path(&self) -> PathBuf { @@ -241,7 +173,7 @@ impl PageServerNode { let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); args.push(Cow::Borrowed("--init")); - let init_output = Command::new(&self.env.pageserver_bin()) + let init_output = Command::new(self.env.pageserver_bin()) .args(args.iter().map(Cow::as_ref)) .envs(self.pageserver_env_variables()?) 
.output() @@ -320,7 +252,7 @@ impl PageServerNode { let token = self .env .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; - vec![("ZENITH_AUTH_TOKEN".to_owned(), token)] + vec![("NEON_AUTH_TOKEN".to_owned(), token)] } else { Vec::new() }) diff --git a/deny.toml b/deny.toml new file mode 100644 index 0000000000..3a0fe36f87 --- /dev/null +++ b/deny.toml @@ -0,0 +1,90 @@ +# This file was auto-generated using `cargo deny init`. +# cargo-deny is a cargo plugin that lets you lint your project's +# dependency graph to ensure all your dependencies conform +# to your expectations and requirements. + +# Root options +targets = [] +all-features = false +no-default-features = false +feature-depth = 1 + +# This section is considered when running `cargo deny check advisories` +# More documentation for the advisories section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html +[advisories] +db-urls = ["https://github.com/rustsec/advisory-db"] +vulnerability = "deny" +unmaintained = "warn" +yanked = "warn" +notice = "warn" +ignore = [] + +# This section is considered when running `cargo deny check licenses` +# More documentation for the licenses section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html +[licenses] +unlicensed = "deny" +allow = [ + "Apache-2.0", + "Artistic-2.0", + "BSD-2-Clause", + "BSD-3-Clause", + "ISC", + "MIT", + "MPL-2.0", + "OpenSSL", + "Unicode-DFS-2016", +] +deny = [] +copyleft = "warn" +allow-osi-fsf-free = "neither" +default = "deny" +confidence-threshold = 0.8 +exceptions = [ + # Zlib license has some restrictions if we decide to change sth + { allow = ["Zlib"], name = "const_format_proc_macros", version = "*" }, + { allow = ["Zlib"], name = "const_format", version = "*" }, +] + +[[licenses.clarify]] +name = "ring" +version = "*" +expression = "MIT AND ISC AND OpenSSL" +license-files = [ + { path = "LICENSE", hash = 0xbd0eed23 }, +] + +[licenses.private] 
+ignore = true +registries = [] + +# This section is considered when running `cargo deny check bans`. +# More documentation about the 'bans' section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html +[bans] +multiple-versions = "warn" +wildcards = "allow" +highlight = "all" +workspace-default-features = "allow" +external-default-features = "allow" +allow = [] +deny = [] +skip = [] +skip-tree = [] + +# This section is considered when running `cargo deny check sources`. +# More documentation about the 'sources' section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html +[sources] +unknown-registry = "warn" +unknown-git = "warn" +allow-registry = ["https://github.com/rust-lang/crates.io-index"] +allow-git = [] + +[sources.allow-org] +github = [ + "neondatabase", +] +gitlab = [] +bitbucket = [] diff --git a/docs/authentication.md b/docs/authentication.md index 0752fae19f..e22d7b700f 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL. #### Outgoing connections Compute connects to Pageserver for getting pages. -The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`. +The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`. The environment variable inside the connection string is substituted with the JWT token. @@ -77,7 +77,7 @@ If the GUC is unset, no token is passed. Note that both tokens can be (and typically are) the same; the scope is the tenant and the token is usually passed through the -`$ZENITH_AUTH_TOKEN` environment variable. +`$NEON_AUTH_TOKEN` environment variable. 
### Pageserver #### Overview @@ -114,7 +114,7 @@ either of three values: Pageserver makes a connection to a Safekeeper for each active timeline. As Pageserver may want to access any timeline it has on the disk, it is given a blanket JWT token to access any data on any Safekeeper. -This token is passed through an environment variable called `ZENITH_AUTH_TOKEN` +This token is passed through an environment variable called `NEON_AUTH_TOKEN` (non-configurable as of writing this text). A better way _may be_ to store JWT token for each timeline next to it, diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index d0cd46d2a9..d155f1e07d 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -2,6 +2,7 @@ name = "metrics" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 2102ae5373..68d4c609f0 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -2,6 +2,7 @@ name = "pageserver_api" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] serde = { version = "1.0", features = ["derive"] } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e49b7051d2..d954e5d21f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -163,6 +163,8 @@ pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub state: TenantState, + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. 
pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } @@ -191,9 +193,12 @@ pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, - pub current_physical_size_non_incremental: Option, + + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, #[serde_as(as = "Option")] @@ -203,29 +208,22 @@ pub struct TimelineInfo { pub pg_version: u32, pub state: TimelineState, - - // Some of the above fields are duplicated in 'local' and 'remote', for backwards- - // compatility with older clients. - pub local: LocalTimelineInfo, - pub remote: RemoteTimelineInfo, } -#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LocalTimelineInfo { - #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, - #[serde_as(as = "Option")] - pub ancestor_lsn: Option, - pub current_logical_size: Option, // is None when timeline is Unloaded - pub current_physical_size: Option, // is None when timeline is Unloaded +pub struct DownloadRemoteLayersTaskInfo { + pub task_id: String, + pub state: DownloadRemoteLayersTaskState, + pub total_layer_count: u64, // stable once `completed` + pub successful_download_count: u64, // stable once `completed` + pub failed_download_count: u64, // stable once `completed` } -#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct RemoteTimelineInfo { - #[serde_as(as = "Option")] - pub remote_consistent_lsn: Option, +pub enum DownloadRemoteLayersTaskState { + Running, + Completed, + ShutDown, } pub type ConfigureFailpointsRequest = Vec; @@ -325,7 +323,7 @@ impl PagestreamFeMessage { 
match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -335,7 +333,7 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -345,7 +343,7 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -356,7 +354,7 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.dbnode); } diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 314f3c6f1c..12b7abcc93 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -2,14 +2,15 @@ name = "postgres_connection" version = "0.1.0" edition = "2021" +license = "Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] anyhow = "1.0" itertools = "0.10.3" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } url = "2.2.2" workspace_hack = { version = "0.1", 
path = "../../workspace_hack" } diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 01ff6ab60e..aa076b08d3 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -2,6 +2,7 @@ name = "postgres_ffi" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] rand = "0.8.3" @@ -21,7 +22,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } wal_craft = { path = "wal_craft" } [build-dependencies] diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 01e5554b8a..5acf90be70 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -14,8 +14,8 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { status ); - let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32) - / pg_constants::CLOG_XACTS_PER_BYTE) as usize; + let byteno: usize = + ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; @@ -25,13 +25,13 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { } pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 { - let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32) - / pg_constants::CLOG_XACTS_PER_BYTE) as usize; + let byteno: usize = + ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; - ((page[byteno] >> bshift) & 
pg_constants::CLOG_XACT_BITMASK) as u8 + (page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK } // See CLOGPagePrecedes in clog.c diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 953723a8f0..272c4d6dcc 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -333,7 +333,7 @@ impl CheckPoint { // We need this segment to start compute node. // pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { - let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize); + let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); let hdr = XLogLongPageHeaderData { @@ -574,7 +574,7 @@ mod tests { // Rename file to partial to actually find last valid lsn, then rename it back. fs::rename( - cfg.wal_dir().join(&last_segment), + cfg.wal_dir().join(last_segment), cfg.wal_dir().join(format!("{}.partial", last_segment)), ) .unwrap(); diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 4c35c5a650..abfc263550 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -2,7 +2,7 @@ name = "wal_craft" version = "0.1.0" edition = "2021" - +license = "Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] @@ -11,7 +11,7 @@ clap = "4.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres_ffi = { path = "../" } tempfile = "3.2" workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index feec3b2ace..969befc8e7 
100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -81,7 +81,7 @@ impl Conf { .new_pg_command("initdb")? .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-U", "postgres", "--no-instructions", "--no-sync"]) + .args(["-U", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); ensure!( @@ -105,12 +105,12 @@ impl Conf { let unix_socket_dir_path = unix_socket_dir.path().to_owned(); let server_process = self .new_pg_command("postgres")? - .args(&["-c", "listen_addresses="]) + .args(["-c", "listen_addresses="]) .arg("-k") .arg(unix_socket_dir_path.as_os_str()) .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output + .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) .stderr(Stdio::from(log_file)) .spawn()?; @@ -142,7 +142,7 @@ impl Conf { ); let output = self .new_pg_command("pg_waldump")? 
- .args(&[ + .args([ &first_segment_file.as_os_str(), &last_segment_file.as_os_str(), ]) diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 4d48e431b4..b9c6a1eab0 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -2,15 +2,17 @@ name = "pq_proto" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" bytes = "1.0.1" pin-project-lite = "0.2.7" -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } rand = "0.8.3" serde = { version = "1.0", features = ["derive"] } tokio = { version = "1.17", features = ["macros"] } tracing = "0.1" +thiserror = "1.0" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 2e311dd6e3..c5e4dbd1f0 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -5,7 +5,7 @@ // Tools for calling certain async methods in sync contexts. pub mod sync; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_protocol::PG_EPOCH; use serde::{Deserialize, Serialize}; @@ -194,6 +194,35 @@ macro_rules! retry_read { }; } +/// An error occured during connection being open. +#[derive(thiserror::Error, Debug)] +pub enum ConnectionError { + /// IO error during writing to or reading from the connection socket. 
+ #[error("Socket IO error: {0}")] + Socket(std::io::Error), + /// Invalid packet was received from client + #[error("Protocol error: {0}")] + Protocol(String), + /// Failed to parse a protocol mesage + #[error("Message parse error: {0}")] + MessageParse(anyhow::Error), +} + +impl From for ConnectionError { + fn from(e: anyhow::Error) -> Self { + Self::MessageParse(e) + } +} + +impl ConnectionError { + pub fn into_io_error(self) -> io::Error { + match self { + ConnectionError::Socket(io) => io, + other => io::Error::new(io::ErrorKind::Other, other.to_string()), + } + } +} + impl FeMessage { /// Read one message from the stream. /// This function returns `Ok(None)` in case of EOF. @@ -216,7 +245,9 @@ impl FeMessage { /// } /// ``` #[inline(never)] - pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result> { + pub fn read( + stream: &mut (impl io::Read + Unpin), + ) -> Result, ConnectionError> { Self::read_fut(&mut AsyncishRead(stream)).wait() } @@ -224,7 +255,7 @@ impl FeMessage { /// See documentation for `Self::read`. pub fn read_fut( stream: &mut Reader, - ) -> SyncFuture>> + '_> + ) -> SyncFuture, ConnectionError>> + '_> where Reader: tokio::io::AsyncRead + Unpin, { @@ -238,17 +269,21 @@ impl FeMessage { let tag = match retry_read!(stream.read_u8().await) { Ok(b) => b, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e.into()), + Err(e) => return Err(ConnectionError::Socket(e)), }; // The message length includes itself, so it better be at least 4. - let len = retry_read!(stream.read_u32().await)? + let len = retry_read!(stream.read_u32().await) + .map_err(ConnectionError::Socket)? 
.checked_sub(4) - .context("invalid message length")?; + .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?; let body = { let mut buffer = vec![0u8; len as usize]; - stream.read_exact(&mut buffer).await?; + stream + .read_exact(&mut buffer) + .await + .map_err(ConnectionError::Socket)?; Bytes::from(buffer) }; @@ -265,7 +300,11 @@ impl FeMessage { b'c' => Ok(Some(FeMessage::CopyDone)), b'f' => Ok(Some(FeMessage::CopyFail)), b'p' => Ok(Some(FeMessage::PasswordMessage(body))), - tag => bail!("unknown message tag: {},'{:?}'", tag, body), + tag => { + return Err(ConnectionError::Protocol(format!( + "unknown message tag: {tag},'{body:?}'" + ))) + } } }) } @@ -275,7 +314,9 @@ impl FeStartupPacket { /// Read startup message from the stream. // XXX: It's tempting yet undesirable to accept `stream` by value, // since such a change will cause user-supplied &mut references to be consumed - pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result> { + pub fn read( + stream: &mut (impl io::Read + Unpin), + ) -> Result, ConnectionError> { Self::read_fut(&mut AsyncishRead(stream)).wait() } @@ -284,7 +325,7 @@ impl FeStartupPacket { // since such a change will cause user-supplied &mut references to be consumed pub fn read_fut( stream: &mut Reader, - ) -> SyncFuture>> + '_> + ) -> SyncFuture, ConnectionError>> + '_> where Reader: tokio::io::AsyncRead + Unpin, { @@ -302,31 +343,41 @@ impl FeStartupPacket { let len = match retry_read!(stream.read_u32().await) { Ok(len) => len as usize, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e.into()), + Err(e) => return Err(ConnectionError::Socket(e)), }; #[allow(clippy::manual_range_contains)] if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { - bail!("invalid message length"); + return Err(ConnectionError::Protocol(format!( + "invalid message length {len}" + ))); } - let request_code = retry_read!(stream.read_u32().await)?; + let request_code = + 
retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?; // the rest of startup packet are params let params_len = len - 8; let mut params_bytes = vec![0u8; params_len]; - stream.read_exact(params_bytes.as_mut()).await?; + stream + .read_exact(params_bytes.as_mut()) + .await + .map_err(ConnectionError::Socket)?; // Parse params depending on request code let req_hi = request_code >> 16; let req_lo = request_code & ((1 << 16) - 1); let message = match (req_hi, req_lo) { (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { - ensure!(params_len == 8, "expected 8 bytes for CancelRequest params"); + if params_len != 8 { + return Err(ConnectionError::Protocol( + "expected 8 bytes for CancelRequest params".to_string(), + )); + } let mut cursor = Cursor::new(params_bytes); FeStartupPacket::CancelRequest(CancelKeyData { - backend_pid: cursor.read_i32().await?, - cancel_key: cursor.read_i32().await?, + backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?, + cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?, }) } (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { @@ -338,7 +389,9 @@ impl FeStartupPacket { FeStartupPacket::GssEncRequest } (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { - bail!("Unrecognized request code {}", unrecognized_code) + return Err(ConnectionError::Protocol(format!( + "Unrecognized request code {unrecognized_code}" + ))); } // TODO bail if protocol major_version is not 3? (major_version, minor_version) => { @@ -346,15 +399,21 @@ impl FeStartupPacket { // See `postgres: ProcessStartupPacket, build_startup_packet`. let mut tokens = str::from_utf8(&params_bytes) .context("StartupMessage params: invalid utf-8")? - .strip_suffix('\0') // drop packet's own null terminator - .context("StartupMessage params: missing null terminator")? 
+ .strip_suffix('\0') // drop packet's own null + .ok_or_else(|| { + ConnectionError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })? .split_terminator('\0'); let mut params = HashMap::new(); while let Some(name) = tokens.next() { - let value = tokens - .next() - .context("StartupMessage params: key without value")?; + let value = tokens.next().ok_or_else(|| { + ConnectionError::Protocol( + "StartupMessage params: key without value".to_string(), + ) + })?; params.insert(name.to_owned(), value.to_owned()); } @@ -458,12 +517,15 @@ pub enum BeMessage<'a> { CloseComplete, // None means column is NULL DataRow(&'a [Option<&'a [u8]>]), - ErrorResponse(&'a str), + ErrorResponse(&'a str, Option<&'a [u8; 5]>), /// Single byte - used in response to SSLRequest/GSSENCRequest. EncryptionResponse(bool), NoData, ParameterDescription, - ParameterStatus(BeParameterStatusMessage<'a>), + ParameterStatus { + name: &'a [u8], + value: &'a [u8], + }, ParseComplete, ReadyForQuery, RowDescription(&'a [RowDescriptor<'a>]), @@ -472,6 +534,28 @@ pub enum BeMessage<'a> { KeepAlive(WalSndKeepAlive), } +/// Common shorthands. +impl<'a> BeMessage<'a> { + /// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8. + /// This is a sensible default, given that: + /// * rust strings only support this encoding out of the box. + /// * tokio-postgres, postgres-jdbc (and probably more) mandate it. + /// + /// TODO: do we need to report `server_encoding` as well? + pub const CLIENT_ENCODING: Self = Self::ParameterStatus { + name: b"client_encoding", + value: b"UTF8", + }; + + /// Build a [`BeMessage::ParameterStatus`] holding the server version. 
+ pub fn server_version(version: &'a str) -> Self { + Self::ParameterStatus { + name: b"server_version", + value: version.as_bytes(), + } + } +} + #[derive(Debug)] pub enum BeAuthenticationSaslMessage<'a> { Methods(&'a [&'a str]), @@ -485,12 +569,6 @@ pub enum BeParameterStatusMessage<'a> { ServerVersion(&'a str), } -impl BeParameterStatusMessage<'static> { - pub fn encoding() -> BeMessage<'static> { - BeMessage::ParameterStatus(Self::Encoding("UTF8")) - } -} - // One row description in RowDescription packet. #[derive(Debug)] pub struct RowDescriptor<'a> { @@ -587,14 +665,15 @@ fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { } /// Safe write of s into buf as cstring (String in the protocol). -fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { - if s.contains(&0) { +fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> { + let bytes = s.as_ref(); + if bytes.contains(&0) { return Err(io::Error::new( io::ErrorKind::InvalidInput, "string contains embedded null", )); } - buf.put_slice(s); + buf.put_slice(bytes); buf.put_u8(0); Ok(()) } @@ -606,6 +685,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result { Ok(result) } +pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; + impl<'a> BeMessage<'a> { /// Write message to the given buf. // Unlike the reading side, we use BytesMut @@ -644,7 +725,7 @@ impl<'a> BeMessage<'a> { Methods(methods) => { buf.put_i32(10); // Specifies that SASL auth method is used. for method in methods.iter() { - write_cstr(method.as_bytes(), buf)?; + write_cstr(method, buf)?; } buf.put_u8(0); // zero terminator for the list } @@ -745,10 +826,7 @@ impl<'a> BeMessage<'a> { // First byte of each field represents type of this field. Set just enough fields // to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error // message text. - BeMessage::ErrorResponse(error_msg) => { - // For all the errors set Severity to Error and error code to - // 'internal error'. 
- + BeMessage::ErrorResponse(error_msg, pg_error_code) => { // 'E' signalizes ErrorResponse messages buf.put_u8(b'E'); write_body(buf, |buf| { @@ -756,10 +834,12 @@ impl<'a> BeMessage<'a> { buf.put_slice(b"ERROR\0"); buf.put_u8(b'C'); // SQLSTATE error code - buf.put_slice(b"CXX000\0"); + buf.put_slice(&terminate_code( + pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR), + )); buf.put_u8(b'M'); // the message - write_cstr(error_msg.as_bytes(), buf)?; + write_cstr(error_msg, buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) @@ -779,7 +859,7 @@ impl<'a> BeMessage<'a> { buf.put_slice(b"NOTICE\0"); buf.put_u8(b'C'); // SQLSTATE error code - buf.put_slice(b"CXX000\0"); + buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR)); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; @@ -799,24 +879,12 @@ impl<'a> BeMessage<'a> { buf.put_u8(response); } - BeMessage::ParameterStatus(param) => { - use std::io::{IoSlice, Write}; - use BeParameterStatusMessage::*; - - let [name, value] = match param { - Encoding(name) => [b"client_encoding", name.as_bytes()], - ServerVersion(version) => [b"server_version", version.as_bytes()], - }; - - // Parameter names and values are passed as null-terminated strings - let iov = &mut [name, b"\0", value, b"\0"].map(IoSlice::new); - let mut buffer = [0u8; 64]; // this should be enough - let cnt = buffer.as_mut().write_vectored(iov).unwrap(); - + BeMessage::ParameterStatus { name, value } => { buf.put_u8(b'S'); write_body(buf, |buf| { - buf.put_slice(&buffer[..cnt]); - }); + write_cstr(name, buf)?; + write_cstr(value, buf) + })?; } BeMessage::ParameterDescription => { @@ -873,7 +941,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'k'); buf.put_u64(req.sent_ptr); buf.put_i64(req.timestamp); - buf.put_u8(if req.request_reply { 1 } else { 0 }); + buf.put_u8(u8::from(req.request_reply)); }); } } @@ -1079,3 +1147,12 @@ mod tests { let _ = FeStartupPacket::read_fut(stream).await; } } + +fn terminate_code(code: &[u8; 5]) -> [u8; 
6] { + let mut terminated = [0; 6]; + for (i, &elem) in code.iter().enumerate() { + terminated[i] = elem; + } + + terminated +} diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index ebd30fc1eb..5a39f27209 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -2,6 +2,7 @@ name = "remote_storage" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 28858fcbab..1091a8bd5c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -7,6 +7,7 @@ //! mod local_fs; mod s3_bucket; +mod simulate_failures; use std::{ collections::HashMap, @@ -24,7 +25,7 @@ use tokio::io; use toml_edit::Item; use tracing::info; -pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; +pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper}; /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency @@ -77,7 +78,10 @@ pub trait RemoteStorage: Send + Sync + 'static { /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) /// so this method doesnt need to. - async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result>; + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError>; /// Streams the local file contents into remote into the remote storage entry. 
async fn upload( @@ -150,6 +154,7 @@ impl std::error::Error for DownloadError {} pub enum GenericRemoteStorage { LocalFs(LocalFs), AwsS3(Arc), + Unreliable(Arc), } impl Deref for GenericRemoteStorage { @@ -159,27 +164,30 @@ impl Deref for GenericRemoteStorage { match self { GenericRemoteStorage::LocalFs(local_fs) => local_fs, GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(), + GenericRemoteStorage::Unreliable(s) => s.as_ref(), } } } impl GenericRemoteStorage { - pub fn from_config( - storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { + pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { Ok(match &storage_config.storage { RemoteStorageKind::LocalFs(root) => { info!("Using fs root '{}' as a remote storage", root.display()); - GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?) + Self::LocalFs(LocalFs::new(root.clone())?) } RemoteStorageKind::AwsS3(s3_config) => { info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) } }) } + pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self { + Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) + } + /// Takes storage object contents and its size and uploads to remote storage, /// mapping `from_path` to the corresponding remote object id in the storage. 
/// diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 50a84eb33f..f1289569ae 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -92,13 +92,17 @@ impl RemoteStorage for LocalFs { .collect()) } - async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { let path = match prefix { Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), None => Cow::Borrowed(&self.storage_root), }; Ok(get_all_files(path.as_ref(), false) - .await? + .await + .map_err(DownloadError::Other)? .into_iter() .map(|path| { path.strip_prefix(&self.storage_root) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 740f3753d8..18a2c5dedd 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -286,7 +286,10 @@ impl RemoteStorage for S3Bucket { /// See the doc for `RemoteStorage::list_prefixes` /// Note: it wont include empty "directories" - async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) @@ -308,7 +311,8 @@ impl RemoteStorage for S3Bucket { .concurrency_limiter .acquire() .await - .context("Concurrency limiter semaphore got closed during S3 list")?; + .context("Concurrency limiter semaphore got closed during S3 list") + .map_err(DownloadError::Other)?; metrics::inc_list_objects(); @@ -324,7 +328,9 @@ impl RemoteStorage for S3Bucket { .map_err(|e| { metrics::inc_list_objects_fail(); e - })?; + }) + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other)?; document_keys.extend( fetch_response diff --git 
a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs new file mode 100644 index 0000000000..643bb99dce --- /dev/null +++ b/libs/remote_storage/src/simulate_failures.rs @@ -0,0 +1,129 @@ +//! This module provides a wrapper around a real RemoteStorage implementation that +//! causes the first N attempts at each upload or download operation to fail. For +//! testing purposes. +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::sync::Mutex; + +use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata}; + +pub struct UnreliableWrapper { + inner: crate::GenericRemoteStorage, + + // This many attempts of each operation will fail, then we let it succeed. + attempts_to_fail: u64, + + // Tracks how many failed attempts of each operation have been made. + attempts: Mutex>, +} + +/// Used to identify retries of distinct operations. +#[derive(Debug, Hash, Eq, PartialEq)] +enum RemoteOp { + List, + ListPrefixes(Option), + Upload(RemotePath), + Download(RemotePath), + Delete(RemotePath), +} + +impl UnreliableWrapper { + pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { + assert!(attempts_to_fail > 0); + UnreliableWrapper { + inner, + attempts_to_fail, + attempts: Mutex::new(HashMap::new()), + } + } + + /// + /// Common functionality for all operations. + /// + /// On the first attempts of this operation, return an error. After 'attempts_to_fail' + /// attempts, let the operation go ahead, and clear the counter. 
+ /// + fn attempt(&self, op: RemoteOp) -> Result { + let mut attempts = self.attempts.lock().unwrap(); + + match attempts.entry(op) { + Entry::Occupied(mut e) => { + let attempts_before_this = { + let p = e.get_mut(); + *p += 1; + *p + }; + + if attempts_before_this >= self.attempts_to_fail { + // let it succeed + e.remove(); + Ok(attempts_before_this) + } else { + let error = + anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); + Err(DownloadError::Other(error)) + } + } + Entry::Vacant(e) => { + let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); + e.insert(1); + Err(DownloadError::Other(error)) + } + } + } +} + +#[async_trait::async_trait] +impl RemoteStorage for UnreliableWrapper { + /// Lists all items the storage has right now. + async fn list(&self) -> anyhow::Result> { + self.attempt(RemoteOp::List)?; + self.inner.list().await + } + + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { + self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; + self.inner.list_prefixes(prefix).await + } + + async fn upload( + &self, + data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>, + // S3 PUT request requires the content length to be specified, + // otherwise it starts to fail with the concurrent connection count increasing. + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + ) -> anyhow::Result<()> { + self.attempt(RemoteOp::Upload(to.clone()))?; + self.inner.upload(data, data_size_bytes, to, metadata).await + } + + async fn download(&self, from: &RemotePath) -> Result { + self.attempt(RemoteOp::Download(from.clone()))?; + self.inner.download(from).await + } + + async fn download_byte_range( + &self, + from: &RemotePath, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + // Note: We treat any download_byte_range as an "attempt" of the same + // operation. We don't pay attention to the ranges. 
That's good enough + // for now. + self.attempt(RemoteOp::Download(from.clone()))?; + self.inner + .download_byte_range(from, start_inclusive, end_exclusive) + .await + } + + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + self.attempt(RemoteOp::Delete(path.clone()))?; + self.inner.delete(path).await + } +} diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 15bdecd71d..32cda78be4 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -2,6 +2,7 @@ name = "safekeeper_api" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] serde = { version = "1.0", features = ["derive"] } diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index 1aabf5a4f9..3a1a0f7915 100644 --- a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -3,6 +3,7 @@ name = "tenant_size_model" version = "0.1.0" edition = "2021" publish = false +license = "Apache-2.0" [dependencies] workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 47639e8205..9c7fcafe23 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -2,9 +2,10 @@ name = "utils" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] -sentry = "0.29.0" +sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } async-trait = "0.1" anyhow = "1.0" bincode = "1.3" diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 3726779cb2..2c7e6e20ab 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -157,34 +157,34 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::AlreadyExists); let invalid_dir_path = file_path.join("folder"); - create_dir_all(&invalid_dir_path).unwrap_err(); + create_dir_all(invalid_dir_path).unwrap_err(); } #[test] fn test_path_with_suffix_extension() { let 
p = PathBuf::from("/foo/bar"); assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp").to_string_lossy(), "/foo/bar.temp" ); let p = PathBuf::from("/foo/bar"); assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp.temp").to_string_lossy(), "/foo/bar.temp.temp" ); let p = PathBuf::from("/foo/bar.baz"); assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp.temp").to_string_lossy(), "/foo/bar.baz.temp.temp" ); let p = PathBuf::from("/foo/bar.baz"); assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + &path_with_suffix_extension(p, ".temp").to_string_lossy(), "/foo/bar.baz..temp" ); let p = PathBuf::from("/foo/bar/dir/"); assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + &path_with_suffix_extension(p, ".temp").to_string_lossy(), "/foo/bar/dir..temp" ); } diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 89f7197718..f3e3835bda 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -3,11 +3,11 @@ //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. 
+use crate::postgres_backend_async::{log_query_error, short_error, QueryError}; use crate::sock_split::{BidiStream, ReadStream, WriteStream}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::Context; use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; -use rand::Rng; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; use serde::{Deserialize, Serialize}; use std::fmt; use std::io::{self, Write}; @@ -22,25 +22,32 @@ pub trait Handler { /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). - fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query_string: &str, + ) -> Result<(), QueryError>; /// Called on startup packet receival, allows to process params. /// /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow /// to override whole init logic in implementations. 
- fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + _sm: &FeStartupPacket, + ) -> Result<(), QueryError> { Ok(()) } - /// Check auth md5 - fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { - bail!("MD5 auth failed") - } - /// Check auth jwt - fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { - bail!("JWT auth failed") + fn check_auth_jwt( + &mut self, + _pgb: &mut PostgresBackend, + _jwt_response: &[u8], + ) -> Result<(), QueryError> { + Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) } fn is_shutdown_requested(&self) -> bool { @@ -61,7 +68,6 @@ pub enum ProtoState { #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum AuthType { Trust, - MD5, // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT NeonJWT, } @@ -72,9 +78,8 @@ impl FromStr for AuthType { fn from_str(s: &str) -> Result { match s { "Trust" => Ok(Self::Trust), - "MD5" => Ok(Self::MD5), "NeonJWT" => Ok(Self::NeonJWT), - _ => bail!("invalid value \"{s}\" for auth type"), + _ => anyhow::bail!("invalid value \"{s}\" for auth type"), } } } @@ -83,7 +88,6 @@ impl fmt::Display for AuthType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(match self { AuthType::Trust => "Trust", - AuthType::MD5 => "MD5", AuthType::NeonJWT => "NeonJWT", }) } @@ -134,7 +138,6 @@ pub struct PostgresBackend { pub state: ProtoState, - md5_salt: [u8; 4], auth_type: AuthType, peer_addr: SocketAddr, @@ -164,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { } // Cast a byte slice to a string slice, dropping null terminator if there's one. 
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> { +fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } @@ -187,7 +190,6 @@ impl PostgresBackend { stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))), buf_out: BytesMut::with_capacity(10 * 1024), state: ProtoState::Initialization, - md5_salt: [0u8; 4], auth_type, tls_config, peer_addr, @@ -199,10 +201,10 @@ impl PostgresBackend { } /// Get direct reference (into the Option) to the read stream. - fn get_stream_in(&mut self) -> Result<&mut BidiStream> { + fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> { match &mut self.stream { Some(Stream::Bidirectional(stream)) => Ok(stream), - _ => bail!("reader taken"), + _ => anyhow::bail!("reader taken"), } } @@ -226,7 +228,7 @@ impl PostgresBackend { } /// Read full message or return None if connection is closed. - pub fn read_message(&mut self) -> Result> { + pub fn read_message(&mut self) -> Result, QueryError> { let (state, stream) = (self.state, self.get_stream_in()?); use ProtoState::*; @@ -234,6 +236,7 @@ impl PostgresBackend { Initialization | Encrypted => FeStartupPacket::read(stream), Authentication | Established => FeMessage::read(stream), } + .map_err(QueryError::from) } /// Write message into internal output buffer. 
@@ -257,7 +260,7 @@ impl PostgresBackend { } // Wrapper for run_message_loop() that shuts down socket when we are done - pub fn run(mut self, handler: &mut impl Handler) -> Result<()> { + pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> { let ret = self.run_message_loop(handler); if let Some(stream) = self.stream.as_mut() { let _ = stream.shutdown(Shutdown::Both); @@ -265,7 +268,7 @@ impl PostgresBackend { ret } - fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> { + fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); let mut unnamed_query_string = Bytes::new(); @@ -274,7 +277,7 @@ impl PostgresBackend { match self.read_message() { Ok(message) => { if let Some(msg) = message { - trace!("got message {:?}", msg); + trace!("got message {msg:?}"); match self.process_message(handler, msg, &mut unnamed_query_string)? { ProcessMsgResult::Continue => continue, @@ -285,10 +288,12 @@ impl PostgresBackend { } } Err(e) => { - // If it is a timeout error, continue the loop - if !is_socket_read_timed_out(&e) { - return Err(e); + if let QueryError::Other(e) = &e { + if is_socket_read_timed_out(e) { + continue; + } } + return Err(e); } } } @@ -306,7 +311,7 @@ impl PostgresBackend { } stream => { self.stream = stream; - bail!("can't start TLs without bidi stream"); + anyhow::bail!("can't start TLs without bidi stream"); } } } @@ -316,17 +321,16 @@ impl PostgresBackend { handler: &mut impl Handler, msg: FeMessage, unnamed_query_string: &mut Bytes, - ) -> Result { + ) -> Result { // Allow only startup and password messages during auth. 
Otherwise client would be able to bypass auth // TODO: change that to proper top-level match of protocol state with separate message handling for each state - if self.state < ProtoState::Established { - ensure!( - matches!( - msg, - FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_) - ), - "protocol violation" - ); + if self.state < ProtoState::Established + && !matches!( + msg, + FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_) + ) + { + return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); } let have_tls = self.tls_config.is_some(); @@ -350,8 +354,13 @@ impl PostgresBackend { } FeStartupPacket::StartupMessage { .. } => { if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; - bail!("client did not connect with TLS"); + self.write_message(&BeMessage::ErrorResponse( + "must connect with TLS", + None, + ))?; + return Err(QueryError::Other(anyhow::anyhow!( + "client did not connect with TLS" + ))); } // NB: startup() may change self.auth_type -- we are using that in proxy code @@ -361,21 +370,12 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => { self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? // The async python driver requires a valid server_version - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion("14.1"), - ))? + .write_message_noflush(&BeMessage::server_version("14.1"))? 
.write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } - AuthType::MD5 => { - rand::thread_rng().fill(&mut self.md5_salt); - self.write_message(&BeMessage::AuthenticationMD5Password( - self.md5_salt, - ))?; - self.state = ProtoState::Authentication; - } AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; @@ -395,25 +395,20 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => unreachable!(), - AuthType::MD5 => { - let (_, md5_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_md5(self, md5_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); - } - } AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); + self.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; + return Err(e); } } } self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } @@ -422,33 +417,14 @@ impl PostgresBackend { // remove null terminator let query_string = cstr_to_str(&body)?; - trace!("got query {:?}", query_string); - // xxx distinguish fatal and recoverable errors? + trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string) { - // ":?" uses the alternate formatting style, which makes anyhow display the - // full cause of the error, not just the top-level context + its trace. - // We don't want to send that in the ErrorResponse though, - // because it's not relevant to the compute node logs. 
- // - // We also don't want to log full stacktrace when the error is primitive, - // such as usual connection closed. - let short_error = format!("{:#}", e); - let root_cause = e.root_cause().to_string(); - if root_cause.contains("connection closed unexpectedly") - || root_cause.contains("Broken pipe (os error 32)") - { - error!( - "query handler for '{}' failed: {}", - query_string, short_error - ); - } else { - error!("query handler for '{}' failed: {:?}", query_string, e); - } - self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?; - // TODO: untangle convoluted control flow - if e.to_string().contains("failed to run") { - return Ok(ProcessMsgResult::Break); - } + log_query_error(query_string, &e); + let short_error = short_error(&e); + self.write_message_noflush(&BeMessage::ErrorResponse( + &short_error, + Some(e.pg_error_code()), + ))?; } self.write_message(&BeMessage::ReadyForQuery)?; } @@ -473,11 +449,13 @@ impl PostgresBackend { FeMessage::Execute(_) => { let query_string = cstr_to_str(unnamed_query_string)?; - trace!("got execute {:?}", query_string); - // xxx distinguish fatal and recoverable errors? + trace!("got execute {query_string:?}"); if let Err(e) = handler.process_query(self, query_string) { - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + log_query_error(query_string, &e); + self.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; } // NOTE there is no ReadyForQuery message. 
This handler is used // for basebackup and it uses CopyOut which doesn't require @@ -496,7 +474,9 @@ impl PostgresBackend { // We prefer explicit pattern matching to wildcards, because // this helps us spot the places where new variants are missing FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { - bail!("unexpected message type: {:?}", msg); + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message type: {msg:?}" + ))); } } diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 376819027b..95b7b3fd15 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -4,45 +4,87 @@ //! is rather narrow, but we can extend it once required. use crate::postgres_backend::AuthType; -use anyhow::{bail, Context, Result}; -use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; -use rand::Rng; +use anyhow::Context; +use bytes::{Buf, Bytes, BytesMut}; +use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR}; use std::future::Future; +use std::io; use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; use std::task::Poll; -use tracing::{debug, error, trace}; +use tracing::{debug, error, info, trace}; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; use tokio_rustls::TlsAcceptor; +pub fn is_expected_io_error(e: &io::Error) -> bool { + use io::ErrorKind::*; + matches!( + e.kind(), + ConnectionRefused | ConnectionAborted | ConnectionReset + ) +} + +/// An error, occurred during query processing: +/// either during the connection ([`ConnectionError`]) or before/after it. +#[derive(thiserror::Error, Debug)] +pub enum QueryError { + /// The connection was lost while processing the query. 
+ #[error(transparent)] + Disconnected(#[from] ConnectionError), + /// Some other error + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: io::Error) -> Self { + Self::Disconnected(ConnectionError::Socket(e)) + } +} + +impl QueryError { + pub fn pg_error_code(&self) -> &'static [u8; 5] { + match self { + Self::Disconnected(_) => b"08006", // connection failure + Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error + } + } +} + #[async_trait::async_trait] pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). - async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + async fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query_string: &str, + ) -> Result<(), QueryError>; /// Called on startup packet receival, allows to process params. /// /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow /// to override whole init logic in implementations. 
- fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + _sm: &FeStartupPacket, + ) -> Result<(), QueryError> { Ok(()) } - /// Check auth md5 - fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { - bail!("MD5 auth failed") - } - /// Check auth jwt - fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { - bail!("JWT auth failed") + fn check_auth_jwt( + &mut self, + _pgb: &mut PostgresBackend, + _jwt_response: &[u8], + ) -> Result<(), QueryError> { + Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) } } @@ -76,17 +118,14 @@ impl AsyncWrite for Stream { self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> Poll> { + ) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), Self::Broken => unreachable!(), } } - fn poll_flush( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { + fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), Self::Tls(stream) => Pin::new(stream).poll_flush(cx), @@ -96,7 +135,7 @@ impl AsyncWrite for Stream { fn poll_shutdown( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> Poll> { + ) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), @@ -109,7 +148,7 @@ impl AsyncRead for Stream { self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &mut tokio::io::ReadBuf<'_>, - ) -> Poll> { + ) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), @@ -120,12 +159,14 @@ impl AsyncRead for Stream { 
pub struct PostgresBackend { stream: Stream, + // Output buffer. c.f. BeMessage::write why we are using BytesMut here. + // The data between 0 and "current position" as tracked by the bytes::Buf + // implementation of BytesMut, have already been written. buf_out: BytesMut, pub state: ProtoState, - md5_salt: [u8; 4], auth_type: AuthType, peer_addr: SocketAddr, @@ -143,7 +184,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec { } // Cast a byte slice to a string slice, dropping null terminator if there's one. -fn cstr_to_str(bytes: &[u8]) -> Result<&str> { +fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } @@ -153,14 +194,13 @@ impl PostgresBackend { socket: tokio::net::TcpStream, auth_type: AuthType, tls_config: Option>, - ) -> std::io::Result { + ) -> io::Result { let peer_addr = socket.peer_addr()?; Ok(Self { stream: Stream::Unencrypted(BufReader::new(socket)), buf_out: BytesMut::with_capacity(10 * 1024), state: ProtoState::Initialization, - md5_salt: [0u8; 4], auth_type, tls_config, peer_addr, @@ -172,30 +212,68 @@ impl PostgresBackend { } /// Read full message or return None if connection is closed. - pub async fn read_message(&mut self) -> Result> { + pub async fn read_message(&mut self) -> Result, QueryError> { use ProtoState::*; match self.state { Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, Authentication | Established => FeMessage::read_fut(&mut self.stream).await, Closed => Ok(None), } + .map_err(QueryError::from) } /// Flush output buffer into the socket. 
- pub async fn flush(&mut self) -> std::io::Result<&mut Self> { - self.stream.write_all(&self.buf_out).await?; + pub async fn flush(&mut self) -> io::Result<()> { + while self.buf_out.has_remaining() { + let bytes_written = self.stream.write(self.buf_out.chunk()).await?; + self.buf_out.advance(bytes_written); + } self.buf_out.clear(); - Ok(self) + Ok(()) } /// Write message into internal output buffer. - pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> { + pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buf_out, message)?; Ok(self) } + /// Returns an AsyncWrite implementation that wraps all the data written + /// to it in CopyData messages, and writes them to the connection + /// + /// The caller is responsible for sending CopyOutResponse and CopyDone messages. + pub fn copyout_writer(&mut self) -> CopyDataWriter { + CopyDataWriter { pgb: self } + } + + /// A polling function that tries to write all the data from 'buf_out' to the + /// underlying stream. 
+ fn poll_write_buf( + &mut self, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + while self.buf_out.has_remaining() { + match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) { + Poll::Ready(Ok(bytes_written)) => { + self.buf_out.advance(bytes_written); + } + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + } + Poll::Ready(Ok(())) + } + + fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + Pin::new(&mut self.stream).poll_flush(cx) + } + // Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()> + pub async fn run( + mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<(), QueryError> where F: Fn() -> S, S: Future, @@ -209,7 +287,7 @@ impl PostgresBackend { &mut self, handler: &mut impl Handler, shutdown_watcher: F, - ) -> Result<()> + ) -> Result<(), QueryError> where F: Fn() -> S, S: Future, @@ -245,7 +323,7 @@ impl PostgresBackend { return Ok(()); } } - Ok::<(), anyhow::Error>(()) + Ok::<(), QueryError>(()) } => { // Handshake complete. result?; @@ -290,14 +368,14 @@ impl PostgresBackend { self.stream = Stream::Tls(Box::new(tls_stream)); return Ok(()); }; - bail!("TLS already started"); + anyhow::bail!("TLS already started"); } async fn process_handshake_message( &mut self, handler: &mut impl Handler, msg: FeMessage, - ) -> Result { + ) -> Result { assert!(self.state < ProtoState::Established); let have_tls = self.tls_config.is_some(); match msg { @@ -320,8 +398,13 @@ impl PostgresBackend { } FeStartupPacket::StartupMessage { .. 
} => { if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; - bail!("client did not connect with TLS"); + self.write_message(&BeMessage::ErrorResponse( + "must connect with TLS", + None, + ))?; + return Err(QueryError::Other(anyhow::anyhow!( + "client did not connect with TLS" + ))); } // NB: startup() may change self.auth_type -- we are using that in proxy code @@ -331,21 +414,12 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => { self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::CLIENT_ENCODING)? // The async python driver requires a valid server_version - .write_message(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion("14.1"), - ))? + .write_message(&BeMessage::server_version("14.1"))? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } - AuthType::MD5 => { - rand::thread_rng().fill(&mut self.md5_salt); - self.write_message(&BeMessage::AuthenticationMD5Password( - self.md5_salt, - ))?; - self.state = ProtoState::Authentication; - } AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; @@ -366,25 +440,20 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => unreachable!(), - AuthType::MD5 => { - let (_, md5_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_md5(self, md5_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); - } - } AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - bail!("auth failed: {}", e); + self.write_message(&BeMessage::ErrorResponse( + 
&e.to_string(), + Some(e.pg_error_code()), + ))?; + return Err(e); } } } self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::CLIENT_ENCODING)? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } @@ -402,33 +471,28 @@ impl PostgresBackend { handler: &mut impl Handler, msg: FeMessage, unnamed_query_string: &mut Bytes, - ) -> Result { + ) -> Result { // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth // TODO: change that to proper top-level match of protocol state with separate message handling for each state assert!(self.state == ProtoState::Established); match msg { FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { - bail!("protocol violation"); + return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); } FeMessage::Query(body) => { // remove null terminator let query_string = cstr_to_str(&body)?; - trace!("got query {:?}", query_string); - // xxx distinguish fatal and recoverable errors? + trace!("got query {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { - // ":?" uses the alternate formatting style, which makes anyhow display the - // full cause of the error, not just the top-level context + its trace. - // We don't want to send that in the ErrorResponse though, - // because it's not relevant to the compute node logs. 
- error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; - // TODO: untangle convoluted control flow - if e.to_string().contains("failed to run") { - return Ok(ProcessMsgResult::Break); - } + log_query_error(query_string, &e); + let short_error = short_error(&e); + self.write_message(&BeMessage::ErrorResponse( + &short_error, + Some(e.pg_error_code()), + ))?; } self.write_message(&BeMessage::ReadyForQuery)?; } @@ -453,11 +517,13 @@ impl PostgresBackend { FeMessage::Execute(_) => { let query_string = cstr_to_str(unnamed_query_string)?; - trace!("got execute {:?}", query_string); - // xxx distinguish fatal and recoverable errors? + trace!("got execute {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + log_query_error(query_string, &e); + self.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; } // NOTE there is no ReadyForQuery message. This handler is used // for basebackup and it uses CopyOut which doesn't require @@ -476,10 +542,99 @@ impl PostgresBackend { // We prefer explicit pattern matching to wildcards, because // this helps us spot the places where new variants are missing FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { - bail!("unexpected message type: {:?}", msg); + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message type: {:?}", + msg + ))); } } Ok(ProcessMsgResult::Continue) } } + +/// +/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData +/// messages. 
+/// + +pub struct CopyDataWriter<'a> { + pgb: &'a mut PostgresBackend, +} + +impl<'a> AsyncWrite for CopyDataWriter<'a> { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + let this = self.get_mut(); + + // It's not strictly required to flush between each message, but makes it easier + // to view in wireshark, and usually the messages that the callers write are + // decently-sized anyway. + match this.pgb.poll_write_buf(cx) { + Poll::Ready(Ok(())) => {} + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + + // CopyData + // XXX: if the input is large, we should split it into multiple messages. + // Not sure what the threshold should be, but the ultimate hard limit is that + // the length cannot exceed u32. + this.pgb.write_message(&BeMessage::CopyData(buf))?; + + Poll::Ready(Ok(buf.len())) + } + + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let this = self.get_mut(); + match this.pgb.poll_write_buf(cx) { + Poll::Ready(Ok(())) => {} + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + this.pgb.poll_flush(cx) + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let this = self.get_mut(); + match this.pgb.poll_write_buf(cx) { + Poll::Ready(Ok(())) => {} + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + } + this.pgb.poll_flush(cx) + } +} + +pub fn short_error(e: &QueryError) -> String { + match e { + QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Other(e) => format!("{e:#}"), + } +} + +pub(super) fn log_query_error(query: &str, e: &QueryError) { + match e { + QueryError::Disconnected(ConnectionError::Socket(io_error)) => { + if is_expected_io_error(io_error) { + info!("query handler for '{query}' failed with expected io error: 
{io_error}"); + } else { + error!("query handler for '{query}' failed with io error: {io_error}"); + } + } + QueryError::Disconnected(other_connection_error) => { + error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") + } + QueryError::Other(e) => { + error!("query handler for '{query}' failed: {e:?}"); + } + } +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index bf330a482c..e3f0b505da 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -11,11 +11,13 @@ use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] -#[error("SeqWaitError")] pub enum SeqWaitError { /// The wait timeout was reached + #[error("seqwait timeout was reached")] Timeout, + /// [`SeqWait::shutdown`] was called + #[error("SeqWait::shutdown was called")] Shutdown, } diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs index 5e4598daf1..b0e5a0bf6a 100644 --- a/libs/utils/src/sock_split.rs +++ b/libs/utils/src/sock_split.rs @@ -50,7 +50,7 @@ impl BufStream { /// Returns a reference to the underlying TcpStream. 
fn get_ref(&self) -> &TcpStream { - &*self.0.get_ref().0 + &self.0.get_ref().0 } } diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs index 248400c2c1..fae707f049 100644 --- a/libs/utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use once_cell::sync::Lazy; -use utils::postgres_backend::{AuthType, Handler, PostgresBackend}; +use utils::{ + postgres_backend::{AuthType, Handler, PostgresBackend}, + postgres_backend_async::QueryError, +}; fn make_tcp_pair() -> (TcpStream, TcpStream) { let listener = TcpListener::bind("127.0.0.1:0").unwrap(); @@ -105,7 +108,7 @@ fn ssl() { &mut self, _pgb: &mut PostgresBackend, query_string: &str, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { self.got_query = query_string == QUERY; Ok(()) } @@ -152,7 +155,7 @@ fn no_ssl() { &mut self, _pgb: &mut PostgresBackend, _query_string: &str, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { panic!() } } @@ -212,7 +215,7 @@ fn server_forces_ssl() { &mut self, _pgb: &mut PostgresBackend, _query_string: &str, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { panic!() } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 54bbe4714d..1854b6762f 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -2,6 +2,7 @@ name = "pageserver" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [features] default = [] @@ -9,8 +10,6 @@ default = [] # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] -profiling = ["pprof"] - [dependencies] amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" } anyhow = { version = "1.0", features = ["backtrace"] } @@ -18,7 +17,7 @@ async-stream = "0.3" async-trait = "0.1" byteorder = "1.4.3" bytes = "1.0.1" -chrono = { version = "0.4.23", default-features = false, features = 
["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] } clap = { version = "4.0", features = ["string"] } close_fds = "0.3.2" const_format = "0.2.21" @@ -36,23 +35,22 @@ nix = "0.25" num-traits = "0.2.15" once_cell = "1.13.0" pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } rand = "0.8.3" regex = "1.4.5" rstar = "0.9.3" scopeguard = "1.1.0" serde = { version = "1.0", features = ["derive"] } -serde_json = "1" +serde_json = { version = "1.0", features = ["raw_value"] } serde_with = "2.0" signal-hook = "0.3.10" svg_fmt = "0.4.1" -tar = "0.4.33" +tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } thiserror = "1.0" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } tokio-util = { version = 
"0.7.3", features = ["io", "io-util"] } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.36" @@ -69,6 +67,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" } tenant_size_model = { path = "../libs/tenant_size_model" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } [dev-dependencies] criterion = "0.4" diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6001377811..6a01fdfc6f 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,8 +1,7 @@ use anyhow::Result; use pageserver::repository::Key; -use pageserver::tenant::filename::{DeltaFileName, ImageFileName}; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::ValueReconstructState; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; @@ -163,7 +162,7 @@ fn bench_from_captest_env(c: &mut Criterion) { c.bench_function("captest_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); @@ -192,7 +191,7 @@ fn bench_from_real_project(c: &mut Criterion) { c.bench_function("real_map_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); @@ -238,7 +237,7 @@ fn bench_sequential(c: &mut Criterion) { // Run the search queries b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 
8f53fce027..61011c9f36 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -84,7 +84,7 @@ fn add_multithreaded_walredo_requesters( barrier.wait(); - execute_all(input, &*manager).unwrap(); + execute_all(input, &manager).unwrap(); barrier.wait(); } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 973c3cd3a6..1978becf83 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,19 +10,24 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, BytesMut}; use fail::fail_point; -use itertools::Itertools; use std::fmt::Write as FmtWrite; -use std::io; -use std::io::Write; -use std::sync::Arc; use std::time::SystemTime; -use tar::{Builder, EntryType, Header}; +use tokio::io; +use tokio::io::AsyncWrite; use tracing::*; -use crate::tenant::Timeline; +/// NB: This relies on a modified version of tokio_tar that does *not* write the +/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped +/// without explicitly calling 'finish' or 'into_inner'! +/// +/// See https://github.com/neondatabase/tokio-tar/pull/1 +/// +use tokio_tar::{Builder, EntryType, Header}; + +use crate::tenant::{with_ondemand_download, Timeline}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; @@ -33,116 +38,130 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +/// Create basebackup with non-rel data in it. +/// Only include relational data if 'full_backup' is true. +/// +/// Currently we use empty 'req_lsn' in two cases: +/// * During the basebackup right after timeline creation +/// * When working without safekeepers. 
In this situation it is important to match the lsn +/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver +/// to start the replication. +pub async fn send_basebackup_tarball<'a, W>( + write: &'a mut W, + timeline: &'a Timeline, + req_lsn: Option, + prev_lsn: Option, + full_backup: bool, +) -> anyhow::Result<()> +where + W: AsyncWrite + Send + Sync + Unpin, +{ + // Compute postgres doesn't have any previous WAL files, but the first + // record that it's going to write needs to include the LSN of the + // previous record (xl_prev). We include prev_record_lsn in the + // "zenith.signal" file, so that postgres can read it during startup. + // + // We don't keep full history of record boundaries in the page server, + // however, only the predecessor of the latest record on each + // timeline. So we can only provide prev_record_lsn when you take a + // base backup at the end of the timeline, i.e. at last_record_lsn. + // Even at the end of the timeline, we sometimes don't have a valid + // prev_lsn value; that happens if the timeline was just branched from + // an old LSN and it doesn't have any WAL of its own yet. We will set + // prev_lsn to Lsn(0) if we cannot provide the correct value. + let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { + // Backup was requested at a particular LSN. The caller should've + // already checked that it's a valid LSN. + + // If the requested point is the end of the timeline, we can + // provide prev_lsn. (get_last_record_rlsn() might return it as + // zero, though, if no WAL has been generated on this timeline + // yet.) + let end_of_timeline = timeline.get_last_record_rlsn(); + if req_lsn == end_of_timeline.last { + (end_of_timeline.prev, req_lsn) + } else { + (Lsn(0), req_lsn) + } + } else { + // Backup was requested at end of the timeline. 
+ let end_of_timeline = timeline.get_last_record_rlsn(); + (end_of_timeline.prev, end_of_timeline.last) + }; + + // Consolidate the derived and the provided prev_lsn values + let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { + if backup_prev != Lsn(0) { + ensure!(backup_prev == provided_prev_lsn); + } + provided_prev_lsn + } else { + backup_prev + }; + + info!( + "taking basebackup lsn={}, prev_lsn={} (full_backup={})", + backup_lsn, prev_lsn, full_backup + ); + + let basebackup = Basebackup { + ar: Builder::new_non_terminated(write), + timeline, + lsn: backup_lsn, + prev_record_lsn: prev_lsn, + full_backup, + }; + basebackup + .send_tarball() + .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn)) + .await +} + /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, W> +struct Basebackup<'a, W> where - W: Write, + W: AsyncWrite + Send + Sync + Unpin, { - ar: Builder>, - timeline: &'a Arc, - pub lsn: Lsn, + ar: Builder<&'a mut W>, + timeline: &'a Timeline, + lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, - finished: bool, } -// Create basebackup with non-rel data in it. -// Only include relational data if 'full_backup' is true. -// -// Currently we use empty lsn in two cases: -// * During the basebackup right after timeline creation -// * When working without safekeepers. In this situation it is important to match the lsn -// we are taking basebackup on with the lsn that is used in pageserver's walreceiver -// to start the replication. 
impl<'a, W> Basebackup<'a, W> where - W: Write, + W: AsyncWrite + Send + Sync + Unpin, { - pub fn new( - write: W, - timeline: &'a Arc, - req_lsn: Option, - prev_lsn: Option, - full_backup: bool, - ) -> Result> { - // Compute postgres doesn't have any previous WAL files, but the first - // record that it's going to write needs to include the LSN of the - // previous record (xl_prev). We include prev_record_lsn in the - // "zenith.signal" file, so that postgres can read it during startup. - // - // We don't keep full history of record boundaries in the page server, - // however, only the predecessor of the latest record on each - // timeline. So we can only provide prev_record_lsn when you take a - // base backup at the end of the timeline, i.e. at last_record_lsn. - // Even at the end of the timeline, we sometimes don't have a valid - // prev_lsn value; that happens if the timeline was just branched from - // an old LSN and it doesn't have any WAL of its own yet. We will set - // prev_lsn to Lsn(0) if we cannot provide the correct value. - let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { - // Backup was requested at a particular LSN. The caller should've - // already checked that it's a valid LSN. - - // If the requested point is the end of the timeline, we can - // provide prev_lsn. (get_last_record_rlsn() might return it as - // zero, though, if no WAL has been generated on this timeline - // yet.) - let end_of_timeline = timeline.get_last_record_rlsn(); - if req_lsn == end_of_timeline.last { - (end_of_timeline.prev, req_lsn) - } else { - (Lsn(0), req_lsn) - } - } else { - // Backup was requested at end of the timeline. 
- let end_of_timeline = timeline.get_last_record_rlsn(); - (end_of_timeline.prev, end_of_timeline.last) - }; - - // Consolidate the derived and the provided prev_lsn values - let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn) - } - provided_prev_lsn - } else { - backup_prev - }; - - info!( - "taking basebackup lsn={}, prev_lsn={} (full_backup={})", - backup_lsn, prev_lsn, full_backup - ); - - Ok(Basebackup { - ar: Builder::new(AbortableWrite::new(write)), - timeline, - lsn: backup_lsn, - prev_record_lsn: prev_lsn, - full_backup, - finished: false, - }) - } - - pub fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> anyhow::Result<()> { // TODO include checksum // Create pgdata subdirs structure for dir in PGDATA_SUBDIRS.iter() { - let header = new_tar_header_dir(*dir)?; - self.ar.append(&header, &mut io::empty())?; + let header = new_tar_header_dir(dir)?; + self.ar + .append(&header, &mut io::empty()) + .await + .context("could not add directory to basebackup tarball")?; } - // Send empty config files. + // Send config files. for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; - self.ar.append(&header, data)?; + self.ar + .append(&header, data) + .await + .context("could not add config file to basebackup tarball")?; } else { let header = new_tar_header(filepath, 0)?; - self.ar.append(&header, &mut io::empty())?; + self.ar + .append(&header, &mut io::empty()) + .await + .context("could not add config file to basebackup tarball")?; } } @@ -152,24 +171,31 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn)? { - self.add_slru_segment(kind, segno)?; + for segno in + with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await? 
+ { + self.add_slru_segment(kind, segno).await?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { - self.add_dbdir(spcnode, dbnode, has_relmap_file)?; + for ((spcnode, dbnode), has_relmap_file) in + with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await? + { + self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { - self.add_rel(rel)?; + for rel in + with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn)) + .await? + { + self.add_rel(rel).await?; } } } - for xid in self.timeline.list_twophase_files(self.lsn)? { - self.add_twophase_file(xid)?; + for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? { + self.add_twophase_file(xid).await?; } fail_point!("basebackup-before-control-file", |_| { @@ -177,42 +203,46 @@ where }); // Generate pg_control and bootstrap WAL segment. 
- self.add_pgcontrol_file()?; - self.ar.finish()?; - self.finished = true; + self.add_pgcontrol_file().await?; + self.ar.finish().await?; debug!("all tarred up!"); Ok(()) } - fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; - - // Function that adds relation segment data to archive - let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { - let file_name = tag.to_segfile_name(segment_index as u32); - let header = new_tar_header(&file_name, data.len() as u64)?; - self.ar.append(&header, data.as_slice())?; - Ok(()) - }; + async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + let nblocks = + with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?; // If the relation is empty, create an empty file if nblocks == 0 { - add_file(0, &vec![])?; + let file_name = tag.to_segfile_name(0); + let header = new_tar_header(&file_name, 0)?; + self.ar.append(&header, &mut io::empty()).await?; return Ok(()); } // Add a file for each chunk of blocks (aka segment) - let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize); - for (seg, blocks) in chunks.into_iter().enumerate() { + let mut startblk = 0; + let mut seg = 0; + while startblk < nblocks { + let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks); + let mut segment_data: Vec = vec![]; - for blknum in blocks { - let img = self - .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; + for blknum in startblk..endblk { + let img = with_ondemand_download(|| { + self.timeline + .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + }) + .await?; segment_data.extend_from_slice(&img[..]); } - add_file(seg, &segment_data)?; + let file_name = tag.to_segfile_name(seg as u32); + let header = new_tar_header(&file_name, segment_data.len() as u64)?; + self.ar.append(&header, segment_data.as_slice()).await?; + + seg += 1; + startblk = endblk; } Ok(()) @@ -221,14 +251,18 @@ where // // Generate SLRU 
segment files from repository. // - fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; + async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { + let nblocks = + with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn)) + .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { - let img = self - .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; + let img = with_ondemand_download(|| { + self.timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + }) + .await?; if slru == SlruKind::Clog { ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); @@ -241,7 +275,7 @@ where let segname = format!("{}/{:>04X}", slru.to_str(), segno); let header = new_tar_header(&segname, slru_buf.len() as u64)?; - self.ar.append(&header, slru_buf.as_slice())?; + self.ar.append(&header, slru_buf.as_slice()).await?; trace!("Added to basebackup slru {} relsize {}", segname, nblocks); Ok(()) @@ -253,14 +287,16 @@ where // Each directory contains a PG_VERSION file, and the default database // directories also contain pg_filenode.map files. 
// - fn add_dbdir( + async fn add_dbdir( &mut self, spcnode: u32, dbnode: u32, has_relmap_file: bool, ) -> anyhow::Result<()> { let relmap_img = if has_relmap_file { - let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + let img = + with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)) + .await?; ensure!(img.len() == 512); Some(img) } else { @@ -270,14 +306,14 @@ where if spcnode == GLOBALTABLESPACE_OID { let pg_version_str = self.timeline.pg_version.to_string(); let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes())?; + self.ar.append(&header, pg_version_str.as_bytes()).await?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..])?; + self.ar.append(&header, &img[..]).await?; } else { warn!("global/pg_filenode.map is missing"); } @@ -293,9 +329,8 @@ where // XLOG_TBLSPC_DROP records. But we probably should just // throw an error on CREATE TABLESPACE in the first place. if !has_relmap_file - && self - .timeline - .list_rels(spcnode, dbnode, self.lsn)? + && with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn)) + .await? 
.is_empty() { return Ok(()); @@ -306,18 +341,18 @@ where // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty())?; + self.ar.append(&header, &mut io::empty()).await?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); let pg_version_str = self.timeline.pg_version.to_string(); let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes())?; + self.ar.append(&header, pg_version_str.as_bytes()).await?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - self.ar.append(&header, &img[..])?; + self.ar.append(&header, &img[..]).await?; } }; Ok(()) @@ -326,8 +361,8 @@ where // // Extract twophase state files // - fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn)?; + async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -335,7 +370,7 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..])?; + self.ar.append(&header, &buf[..]).await?; Ok(()) } @@ -344,7 +379,7 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -356,18 +391,18 @@ where } else { write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; } - self.ar.append( - &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, - zenith_signal.as_bytes(), - )?; + self.ar + .append( + &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, + zenith_signal.as_bytes(), + ) + .await?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) + let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn)) + .await .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) + let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn)) + .await .context("failed get control bytes")?; let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( @@ -379,7 +414,7 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..])?; + self.ar.append(&header, &pg_control_bytes[..]).await?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -391,24 +426,11 @@ where postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..])?; + self.ar.append(&header, &wal_seg[..]).await?; Ok(()) } } -impl<'a, W> Drop for Basebackup<'a, W> -where - W: Write, -{ - /// If the basebackup was not finished, prevent the Archive::drop() from - /// writing the end-of-archive marker. 
- fn drop(&mut self) { - if !self.finished { - self.ar.get_mut().abort(); - } - } -} - // // Create new tarball entry header // @@ -444,49 +466,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result
{ header.set_cksum(); Ok(header) } - -/// A wrapper that passes through all data to the underlying Write, -/// until abort() is called. -/// -/// tar::Builder has an annoying habit of finishing the archive with -/// a valid tar end-of-archive marker (two 512-byte sectors of zeros), -/// even if an error occurs and we don't finish building the archive. -/// We'd rather abort writing the tarball immediately than construct -/// a seemingly valid but incomplete archive. This wrapper allows us -/// to swallow the end-of-archive marker that Builder::drop() emits, -/// without writing it to the underlying sink. -/// -struct AbortableWrite { - w: W, - aborted: bool, -} - -impl AbortableWrite { - pub fn new(w: W) -> Self { - AbortableWrite { w, aborted: false } - } - - pub fn abort(&mut self) { - self.aborted = true; - } -} - -impl Write for AbortableWrite -where - W: Write, -{ - fn write(&mut self, data: &[u8]) -> io::Result { - if self.aborted { - Ok(data.len()) - } else { - self.w.write(data) - } - } - fn flush(&mut self) -> io::Result<()> { - if self.aborted { - Ok(()) - } else { - self.w.flush() - } - } -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 345f391e61..18ec1ac68b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -7,19 +7,20 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; +use remote_storage::GenericRemoteStorage; use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, task_mgr, + http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, }, - tenant_mgr, virtual_file, + tenant::mgr, + virtual_file, }; -use remote_storage::GenericRemoteStorage; use utils::{ auth::JwtAuth, 
logging, @@ -39,8 +40,6 @@ const FEATURES: &[&str] = &[ "testing", #[cfg(feature = "fail/failpoints")] "fail/failpoints", - #[cfg(feature = "profiling")] - "profiling", ]; fn version() -> String { @@ -127,7 +126,7 @@ fn initialize_config( ); } // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| { format!( "Failed to read pageserver config at '{}'", cfg_file_path.display() @@ -181,7 +180,7 @@ fn initialize_config( if update_config { info!("Writing pageserver config to '{}'", cfg_file_path.display()); - std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { + std::fs::write(cfg_file_path, toml.to_string()).with_context(|| { format!( "Failed to write pageserver config to '{}'", cfg_file_path.display() @@ -201,8 +200,12 @@ fn initialize_config( } fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { + // Initialize logging logging::init(conf.log_format)?; + + // Print version to the log, and expose it as a prometheus metric too. info!("version: {}", version()); + set_build_info_metric(GIT_VERSION); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -218,40 +221,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ) } + // Create and lock PID file. This ensures that there cannot be more than one + // pageserver process running at the same time. 
let lock_file_path = conf.workdir.join(PID_FILE_NAME); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); - // ensure that the lock file is held even if the main thread of the process is panics - // we need to release the lock file only when the current process is gone + // Ensure that the lock file is held even if the main thread of the process panics. + // We need to release the lock file only when the process exits. std::mem::forget(lock_file); - // TODO: Check that it looks like a valid repository before going further + // Bind the HTTP and libpq ports early, so that if they are in use by some other + // process, we error out early. + let http_addr = &conf.listen_http_addr; + info!("Starting pageserver http handler on {http_addr}"); + let http_listener = tcp_listener::bind(http_addr)?; - // bind sockets before daemonizing so we report errors early and do not return until we are listening - info!( - "Starting pageserver http handler on {}", - conf.listen_http_addr - ); - let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?; - - info!( - "Starting pageserver pg protocol handler on {}", - conf.listen_pg_addr - ); - let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?; + let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); + let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Install signal handlers let signals = signals::install_shutdown_handlers()?; - // start profiler (if enabled) - let profiler_guard = profiling::init_profiler(conf); - + // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; - // initialize authentication for incoming connections + // Initialize authentication for incoming connections let auth = match &conf.auth_type { - AuthType::Trust | AuthType::MD5 => None, + AuthType::Trust => None, 
AuthType::NeonJWT => { // unwrap is ok because check is performed when creating config, so path is set and file exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); @@ -260,54 +259,54 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }; info!("Using auth: {:#?}", conf.auth_type); - match var("ZENITH_AUTH_TOKEN") { - Ok(v) => { + // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration. + match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) { + (old, Ok(v)) => { info!("Loaded JWT token for authentication with Safekeeper"); + if let Ok(v_old) = old { + warn!( + "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated" + ); + if v_old != v { + warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN"); + } + } pageserver::config::SAFEKEEPER_AUTH_TOKEN .set(Arc::new(v)) .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; } - Err(VarError::NotPresent) => { + (Ok(v), _) => { + info!("Loaded JWT token for authentication with Safekeeper"); + warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN"); + pageserver::config::SAFEKEEPER_AUTH_TOKEN + .set(Arc::new(v)) + .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; + } + (_, Err(VarError::NotPresent)) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { + (_, Err(e)) => { return Err(e).with_context(|| { - "Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable" + "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" }) } }; - let remote_storage = conf - .remote_storage_config - .as_ref() - .map(GenericRemoteStorage::from_config) - .transpose() - .context("Failed to init generic remote storage")?; + // Set up remote storage client + let remote_storage = create_remote_storage_client(conf)?; - let 
(init_result_sender, init_result_receiver) = - std::sync::mpsc::channel::>(); - let storage_for_spawn = remote_storage.clone(); - let _handler = BACKGROUND_RUNTIME.spawn(async move { - let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await; - init_result_sender.send(result) - }); - match init_result_receiver.recv() { - Ok(init_result) => init_result.context("Failed to init tenant_mgr")?, - Err(_sender_dropped_err) => { - anyhow::bail!("Failed to init tenant_mgr: no init status was returned"); - } - } + // Scan the local 'tenants/' directory and start loading the tenants + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; - // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. - // bind before launching separate thread so the error reported before startup exits - - // Create a Service from the router above to handle incoming requests. + // Start up the service to handle HTTP mgmt API request. We created the + // listener earlier already. { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, auth.clone(), remote_storage)?; - let service = - utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let router = http::make_router(conf, auth.clone(), remote_storage)? + .build() + .map_err(|err| anyhow!(err))?; + let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? 
.serve(service) .with_graceful_shutdown(task_mgr::shutdown_watcher()); @@ -324,10 +323,31 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { Ok(()) }, ); + + if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::MetricsCollection, + None, + None, + "consumption metrics collection", + true, + async move { + pageserver::consumption_metrics::collect_metrics( + metric_collection_endpoint, + conf.metric_collection_interval, + conf.id, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + }, + ); + } } // Spawn a task to listen for libpq connections. It will spawn further tasks - // for each connection. + // for each connection. We created the listener earlier already. task_mgr::spawn( COMPUTE_REQUEST_RUNTIME.handle(), TaskKind::LibpqEndpointListener, @@ -340,8 +360,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }, ); - set_build_info_metric(GIT_VERSION); - // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { Signal::Quit => { @@ -349,7 +367,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { "Got {}. Terminating in immediate shutdown mode", signal.name() ); - profiling::exit_profiler(conf, &profiler_guard); std::process::exit(111); } @@ -358,13 +375,42 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { "Got {}. Terminating gracefully in fast shutdown mode", signal.name() ); - profiling::exit_profiler(conf, &profiler_guard); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } }) } +fn create_remote_storage_client( + conf: &'static PageServerConf, +) -> anyhow::Result> { + let config = if let Some(config) = &conf.remote_storage_config { + config + } else { + // No remote storage configured. 
+ return Ok(None); + }; + + // Create the client + let mut remote_storage = GenericRemoteStorage::from_config(config)?; + + // If `test_remote_failures` is non-zero, wrap the client with a + // wrapper that simulates failures. + if conf.test_remote_failures > 0 { + if !cfg!(feature = "testing") { + anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + } + info!( + "Simulating remote failures for first {} attempts of each op", + conf.test_remote_failures + ); + remote_storage = + GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); + } + + Ok(Some(remote_storage)) +} + fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs index b1484ac45a..9da173c873 100644 --- a/pageserver/src/bin/pageserver_binutils.rs +++ b/pageserver/src/bin/pageserver_binutils.rs @@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> { } fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { - let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?; + let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; println!("{control_file:?}"); let control_file_initdb = Lsn(control_file.checkPoint); println!( @@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> { } fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { - let metadata_bytes = std::fs::read(&path)?; + let metadata_bytes = std::fs::read(path)?; let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; @@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an if update_meta { let metadata_bytes = meta.to_bytes()?; - 
std::fs::write(&path, &metadata_bytes)?; + std::fs::write(path, metadata_bytes)?; } Ok(()) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 48e9f32276..7b99d98581 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -12,6 +12,7 @@ use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; use once_cell::sync::OnceCell; +use reqwest::Url; use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -26,14 +27,15 @@ use utils::{ postgres_backend::AuthType, }; +use crate::tenant::config::TenantConf; +use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; -use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX, }; pub mod defaults { - use crate::tenant_config::defaults::*; + use crate::tenant::config::defaults::*; use const_format::formatcp; pub use pageserver_api::{ @@ -55,6 +57,8 @@ pub mod defaults { pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; + pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; /// /// Default built-in configuration file. /// @@ -78,6 +82,8 @@ pub mod defaults { #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' +#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -132,16 +138,22 @@ pub struct PageServerConf { pub auth_validation_public_key_path: Option, pub remote_storage_config: Option, - pub profiling: ProfilingConfig, pub default_tenant_conf: TenantConf, /// Storage broker endpoints to connect to. 
pub broker_endpoint: Uri, + pub broker_keepalive_interval: Duration, pub log_format: LogFormat, /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, + + // How often to collect metrics and send them to the metrics endpoint. + pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + + pub test_remote_failures: u64, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -152,25 +164,6 @@ pub struct PageServerConf { /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum ProfilingConfig { - Disabled, - PageRequests, -} - -impl FromStr for ProfilingConfig { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - let result = match s { - "disabled" => ProfilingConfig::Disabled, - "page_requests" => ProfilingConfig::PageRequests, - _ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""), - }; - Ok(result) - } -} - // use dedicated enum for builder to better indicate the intention // and avoid possible confusion with nested options pub enum BuilderValue { @@ -213,12 +206,17 @@ struct PageServerConfigBuilder { id: BuilderValue, - profiling: BuilderValue, broker_endpoint: BuilderValue, + broker_keepalive_interval: BuilderValue, log_format: BuilderValue, concurrent_tenant_size_logical_size_queries: BuilderValue, + + metric_collection_interval: BuilderValue, + metric_collection_endpoint: BuilderValue>, + + test_remote_failures: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -243,13 +241,23 @@ impl Default for PageServerConfigBuilder { auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), id: NotSet, - profiling: Set(ProfilingConfig::Disabled), broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() 
.expect("failed to parse default broker endpoint")), + broker_keepalive_interval: Set(humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL, + ) + .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + metric_collection_interval: Set(humantime::parse_duration( + DEFAULT_METRIC_COLLECTION_INTERVAL, + ) + .expect("cannot parse default metric collection interval")), + metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + + test_remote_failures: Set(0), } } } @@ -310,12 +318,12 @@ impl PageServerConfigBuilder { self.broker_endpoint = BuilderValue::Set(broker_endpoint) } - pub fn id(&mut self, node_id: NodeId) { - self.id = BuilderValue::Set(node_id) + pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { + self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) } - pub fn profiling(&mut self, profiling: ProfilingConfig) { - self.profiling = BuilderValue::Set(profiling) + pub fn id(&mut self, node_id: NodeId) { + self.id = BuilderValue::Set(node_id) } pub fn log_format(&mut self, log_format: LogFormat) { @@ -326,6 +334,18 @@ impl PageServerConfigBuilder { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } + pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { + self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) + } + + pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { + self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) + } + + pub fn test_remote_failures(&mut self, fail_first: u64) { + self.test_remote_failures = BuilderValue::Set(fail_first); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -359,18 +379,29 @@ impl PageServerConfigBuilder { 
.remote_storage_config .ok_or(anyhow!("missing remote_storage_config"))?, id: self.id.ok_or(anyhow!("missing id"))?, - profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), broker_endpoint: self .broker_endpoint .ok_or(anyhow!("No broker endpoints provided"))?, + broker_keepalive_interval: self + .broker_keepalive_interval + .ok_or(anyhow!("No broker keepalive interval provided"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, concurrent_tenant_size_logical_size_queries: self .concurrent_tenant_size_logical_size_queries .ok_or(anyhow!( "missing concurrent_tenant_size_logical_size_queries" ))?, + metric_collection_interval: self + .metric_collection_interval + .ok_or(anyhow!("missing metric_collection_interval"))?, + metric_collection_endpoint: self + .metric_collection_endpoint + .ok_or(anyhow!("missing metric_collection_endpoint"))?, + test_remote_failures: self + .test_remote_failures + .ok_or(anyhow!("missing test_remote_failuers"))?, }) } } @@ -530,8 +561,8 @@ impl PageServerConf { t_conf = Self::parse_toml_tenant_conf(item)?; } "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), - "profiling" => builder.profiling(parse_toml_from_str(key, item)?), "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), + "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? 
), @@ -541,6 +572,13 @@ impl PageServerConf { let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; ConfigurableSemaphore::new(permits) }), + "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), + "metric_collection_endpoint" => { + let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; + builder.metric_collection_endpoint(Some(endpoint)); + }, + + "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -656,11 +694,14 @@ impl PageServerConf { auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, - profiling: ProfilingConfig::Disabled, - default_tenant_conf: TenantConf::dummy_conf(), + default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: Duration::from_secs(60), + metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + test_remote_failures: 0, } } } @@ -791,6 +832,8 @@ max_file_descriptors = 333 initial_superuser_name = 'zzzz' id = 10 +metric_collection_interval = '222 s' +metric_collection_endpoint = 'http://localhost:80/metrics' log_format = 'json' "#; @@ -826,11 +869,18 @@ log_format = 'json' auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, - profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL + 
)?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: humantime::parse_duration( + defaults::DEFAULT_METRIC_COLLECTION_INTERVAL + )?, + metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + test_remote_failures: 0, }, "Correct defaults should be used when no config values are provided" ); @@ -869,11 +919,14 @@ log_format = 'json' auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, - profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: Duration::from_secs(222), + metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), + test_remote_failures: 0, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs new file mode 100644 index 0000000000..c411a9e025 --- /dev/null +++ b/pageserver/src/consumption_metrics.rs @@ -0,0 +1,324 @@ +//! +//! Periodically collect consumption metrics for all active tenants +//! and push them to a HTTP endpoint. +//! Cache metrics to send only the updated ones. +//! 
+ +use anyhow; +use tracing::*; +use utils::id::NodeId; +use utils::id::TimelineId; + +use crate::task_mgr; +use crate::tenant::mgr; +use pageserver_api::models::TenantState; +use utils::id::TenantId; + +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use std::collections::HashMap; +use std::fmt; +use std::str::FromStr; +use std::time::Duration; + +use chrono::{DateTime, Utc}; +use rand::Rng; +use reqwest::Url; + +/// ConsumptionMetric struct that defines the format for one metric entry +/// i.e. +/// +/// ```json +/// { +/// "metric": "remote_storage_size", +/// "type": "absolute", +/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d", +/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143", +/// "time": "2022-12-28T11:07:19.317310284Z", +/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019", +/// "value": 12345454, +/// } +/// ``` +#[serde_as] +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] +pub struct ConsumptionMetric { + pub metric: ConsumptionMetricKind, + #[serde(rename = "type")] + pub metric_type: &'static str, + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + #[serde_as(as = "Option")] + #[serde(skip_serializing_if = "Option::is_none")] + pub timeline_id: Option, + pub time: DateTime, + pub idempotency_key: String, + pub value: u64, +} + +impl ConsumptionMetric { + pub fn new_absolute( + metric: ConsumptionMetricKind, + tenant_id: TenantId, + timeline_id: Option, + value: u64, + node_id: NodeId, + rng: &mut R, + ) -> Self { + Self { + metric, + metric_type: "absolute", + tenant_id, + timeline_id, + time: Utc::now(), + // key that allows metric collector to distinguish unique events + idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)), + value, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ConsumptionMetricKind { + /// 
Amount of WAL produced , by a timeline, i.e. last_record_lsn + /// This is an absolute, per-timeline metric. + WrittenSize, + /// Size of all tenant branches including WAL + /// This is an absolute, per-tenant metric. + /// This is the same metric that tenant/tenant_id/size endpoint returns. + SyntheticStorageSize, + /// Size of all the layer files in the tenant's directory on disk on the pageserver. + /// This is an absolute, per-tenant metric. + /// See also prometheus metric RESIDENT_PHYSICAL_SIZE. + ResidentSize, + /// Size of the remote storage (S3) directory. + /// This is an absolute, per-tenant metric. + RemoteStorageSize, + /// Logical size of the data in the timeline + /// This is an absolute, per-timeline metric + TimelineLogicalSize, +} + +impl FromStr for ConsumptionMetricKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "written_size" => Ok(Self::WrittenSize), + "synthetic_storage_size" => Ok(Self::SyntheticStorageSize), + "resident_size" => Ok(Self::ResidentSize), + "remote_storage_size" => Ok(Self::RemoteStorageSize), + "timeline_logical_size" => Ok(Self::TimelineLogicalSize), + _ => anyhow::bail!("invalid value \"{s}\" for metric type"), + } + } +} + +impl fmt::Display for ConsumptionMetricKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + ConsumptionMetricKind::WrittenSize => "written_size", + ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size", + ConsumptionMetricKind::ResidentSize => "resident_size", + ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size", + ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size", + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ConsumptionMetricsKey { + tenant_id: TenantId, + timeline_id: Option, + metric: ConsumptionMetricKind, +} + +#[derive(serde::Serialize)] +struct EventChunk<'a> { + events: &'a [ConsumptionMetric], +} + +/// Main thread that serves metrics 
collection +pub async fn collect_metrics( + metric_collection_endpoint: &Url, + metric_collection_interval: Duration, + node_id: NodeId, +) -> anyhow::Result<()> { + let mut ticker = tokio::time::interval(metric_collection_interval); + + info!("starting collect_metrics"); + + // define client here to reuse it for all requests + let client = reqwest::Client::new(); + let mut cached_metrics: HashMap = HashMap::new(); + + loop { + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("collect_metrics received cancellation request"); + return Ok(()); + }, + _ = ticker.tick() => { + collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?; + } + } + } +} + +/// One iteration of metrics collection +/// +/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`. +/// Cache metrics to avoid sending the same metrics multiple times. +pub async fn collect_metrics_task( + client: &reqwest::Client, + cached_metrics: &mut HashMap, + metric_collection_endpoint: &reqwest::Url, + node_id: NodeId, +) -> anyhow::Result<()> { + let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new(); + trace!( + "starting collect_metrics_task. 
metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + // get list of tenants + let tenants = mgr::list_tenants().await; + + // iterate through list of Active tenants and collect metrics + for (tenant_id, tenant_state) in tenants { + if tenant_state != TenantState::Active { + continue; + } + + let tenant = mgr::get_tenant(tenant_id, true).await?; + + let mut tenant_resident_size = 0; + + // iterate through list of timelines in tenant + for timeline in tenant.list_timelines().iter() { + // collect per-timeline metrics only for active timelines + if timeline.is_active() { + let timeline_written_size = u64::from(timeline.get_last_record_lsn()); + + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: ConsumptionMetricKind::WrittenSize, + }, + timeline_written_size, + )); + + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + // Only send timeline logical size when it is fully calculated. 
+ if is_exact { + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: ConsumptionMetricKind::TimelineLogicalSize, + }, + timeline_logical_size, + )); + } + } + + let timeline_resident_size = timeline.get_resident_physical_size(); + tenant_resident_size += timeline_resident_size; + } + + let tenant_remote_size = tenant.get_remote_size().await?; + debug!( + "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}", + tenant_id, tenant_state, tenant_resident_size, tenant_remote_size + ); + + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: ConsumptionMetricKind::ResidentSize, + }, + tenant_resident_size, + )); + + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: ConsumptionMetricKind::RemoteStorageSize, + }, + tenant_remote_size, + )); + + // TODO add SyntheticStorageSize metric + } + + // Filter metrics + current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) { + Some(val) => val != curr_val, + None => true, + }); + + if current_metrics.is_empty() { + trace!("no new metrics to send"); + return Ok(()); + } + + // Send metrics. 
+ // Split into chunks of 1000 metrics to avoid exceeding the max request size + const CHUNK_SIZE: usize = 1000; + let chunks = current_metrics.chunks(CHUNK_SIZE); + + let mut chunk_to_send: Vec = Vec::with_capacity(1000); + + for chunk in chunks { + chunk_to_send.clear(); + + // this code block is needed to convince compiler + // that rng is not reused aroung await point + { + // enrich metrics with timestamp and metric_kind before sending + let mut rng = rand::thread_rng(); + chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| { + ConsumptionMetric::new_absolute( + curr_key.metric, + curr_key.tenant_id, + curr_key.timeline_id, + *curr_val, + node_id, + &mut rng, + ) + })); + } + + let chunk_json = serde_json::value::to_raw_value(&EventChunk { + events: &chunk_to_send, + }) + .expect("ConsumptionMetric should not fail serialization"); + + let res = client + .post(metric_collection_endpoint.clone()) + .json(&chunk_json) + .send() + .await; + + match res { + Ok(res) => { + if res.status().is_success() { + // update cached metrics after they were sent successfully + for (curr_key, curr_val) in chunk.iter() { + cached_metrics.insert(curr_key.clone(), *curr_val); + } + } else { + error!("metrics endpoint refused the sent metrics: {:?}", res); + } + } + Err(err) => { + error!("failed to send metrics: {:?}", err); + } + } + } + + Ok(()) +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b372410c0d..f9b8a81dad 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -77,16 +77,6 @@ paths: schema: type: string format: hex - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - - name: include-non-incremental-physical-size - in: query - schema: - type: string - description: Controls calculation of current_physical_size_non_incremental get: description: Get timelines for tenant 
responses: @@ -139,17 +129,6 @@ paths: format: hex get: description: Get info about the timeline - parameters: - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - - name: include-non-incremental-physical-size - in: query - schema: - type: string - description: Controls calculation of current_physical_size_non_incremental responses: "200": description: TimelineInfo @@ -779,10 +758,6 @@ components: type: integer current_physical_size: type: integer - current_logical_size_non_incremental: - type: integer - current_physical_size_non_incremental: - type: integer wal_source_connstr: type: string last_received_msg_lsn: @@ -795,37 +770,6 @@ components: latest_gc_cutoff_lsn: type: string format: hex - - # These 'local' and 'remote' fields just duplicate some of the fields - # above. They are kept for backwards-compatibility. They can be removed, - # when the control plane has been updated to look at the above fields - # directly. 
- local: - $ref: "#/components/schemas/LocalTimelineInfo" - remote: - $ref: "#/components/schemas/RemoteTimelineInfo" - - LocalTimelineInfo: - type: object - properties: - ancestor_timeline_id: - type: string - format: hex - ancestor_lsn: - type: string - format: hex - current_logical_size: - type: integer - current_physical_size: - type: integer - RemoteTimelineInfo: - type: object - required: - - remote_consistent_lsn - properties: - remote_consistent_lsn: - type: string - format: hex Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0ef555c4aa..1c5eacd362 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -4,16 +4,17 @@ use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use remote_storage::GenericRemoteStorage; +use tokio_util::sync::CancellationToken; use tracing::*; use super::models::{ - LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest, - TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::tenant::Timeline; -use crate::tenant_config::TenantConfOpt; -use crate::{config::PageServerConf, tenant_mgr}; +use crate::tenant::config::TenantConfOpt; +use crate::tenant::{with_ondemand_download, Timeline}; +use crate::{config::PageServerConf, tenant::mgr}; use utils::{ auth::JwtAuth, http::{ @@ -30,8 +31,6 @@ use utils::{ // Imports only used for testing APIs #[cfg(feature = "testing")] use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; -#[cfg(feature = "testing")] -use crate::CheckpointConfig; struct State { conf: &'static PageServerConf, @@ -79,19 +78,23 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res } // Helper function to 
construct a TimelineInfo struct for a timeline -fn build_timeline_info( +async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, ) -> anyhow::Result { let mut info = build_timeline_info_common(timeline)?; if include_non_incremental_logical_size { - info.current_logical_size_non_incremental = - Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?); - } - if include_non_incremental_physical_size { - info.current_physical_size_non_incremental = - Some(timeline.get_physical_size_non_incremental()?) + // XXX we should be using spawn_ondemand_logical_size_calculation here. + // Otherwise, if someone deletes the timeline / detaches the tenant while + // we're executing this function, we will outlive the timeline on-disk state. + info.current_logical_size_non_incremental = Some( + timeline + .get_current_logical_size_non_incremental( + info.last_record_lsn, + CancellationToken::new(), + ) + .await?, + ); } Ok(info) } @@ -117,13 +120,13 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result Some(lsn), }; let current_logical_size = match timeline.get_current_logical_size() { - Ok(size) => Some(size), + Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); None } }; - let current_physical_size = Some(timeline.get_physical_size()); + let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok()); let state = timeline.current_state(); let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); @@ -140,25 +143,13 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result) -> Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); - let include_non_incremental_physical_size = - 
query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; let response_data = async { - let tenant = tenant_mgr::get_tenant(tenant_id, true) + let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::NotFound)?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { - let timeline_info = build_timeline_info( - &timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") - .map_err(ApiError::InternalServerError)?; + let timeline_info = + build_timeline_info(&timeline, include_non_incremental_logical_size) + .await + .context( + "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", + ) + .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); } - Ok(response_data) } .instrument(info_span!("timeline_list", tenant = %tenant_id)) @@ -271,12 +259,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result(timeline_info) } @@ -311,14 +294,15 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result format!("{lsn}"), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), @@ -338,7 +322,7 @@ async fn tenant_attach_handler(request: Request) -> Result, if let Some(remote_storage) = &state.remote_storage { // FIXME: distinguish between "Tenant already exists" and other errors - tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await .map_err(ApiError::InternalServerError)?; @@ -356,7 +340,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; - 
tenant_mgr::detach_tenant(conf, tenant_id) + mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) .await // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. @@ -388,7 +372,7 @@ async fn tenant_load_handler(request: Request) -> Result, A check_permission(&request, Some(tenant_id))?; let state = get_state(&request); - tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) .instrument(info_span!("load", tenant = %tenant_id)) .await .map_err(ApiError::InternalServerError)?; @@ -402,7 +386,7 @@ async fn tenant_ignore_handler(request: Request) -> Result, let state = get_state(&request); let conf = state.conf; - tenant_mgr::ignore_tenant(conf, tenant_id) + mgr::ignore_tenant(conf, tenant_id) .instrument(info_span!("ignore_tenant", tenant = %tenant_id)) .await // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors. 
@@ -415,7 +399,7 @@ async fn tenant_ignore_handler(request: Request) -> Result, async fn tenant_list_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; - let response_data = tenant_mgr::list_tenants() + let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await .iter() @@ -435,12 +419,12 @@ async fn tenant_status(request: Request) -> Result, ApiErro check_permission(&request, Some(tenant_id))?; let tenant_info = async { - let tenant = tenant_mgr::get_tenant(tenant_id, false).await?; + let tenant = mgr::get_tenant(tenant_id, false).await?; // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { - current_physical_size += timeline.get_physical_size(); + current_physical_size += timeline.layer_size_sum().approximate_is_ok(); } let state = tenant.current_state(); @@ -462,7 +446,7 @@ async fn tenant_size_handler(request: Request) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let tenant = tenant_mgr::get_tenant(tenant_id, true) + let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::InternalServerError)?; @@ -583,7 +567,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result = result_receiver + .await + .context("receive compaction result") + .map_err(ApiError::InternalServerError)?; + result.map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) } @@ -775,20 +759,63 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = 
tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + match timeline.spawn_download_all_remote_layers().await { + Ok(st) => json_response(StatusCode::ACCEPTED, st), + Err(st) => json_response(StatusCode::CONFLICT, st), + } +} + +async fn timeline_download_remote_layers_handler_get( + request: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + let info = timeline + .get_download_all_remote_layers_task_info() + .context("task never started since last pageserver process start") + .map_err(ApiError::NotFound)?; + json_response(StatusCode::OK, info) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -873,6 +900,14 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", testing_api!("run timeline checkpoint", timeline_checkpoint_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_post, + ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_get, + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 642e41765b..ca1514dd00 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,12 +2,13 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! 
-use std::fs::File; -use std::io::{Read, Seek, SeekFrom}; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; +use futures::StreamExt; +use tokio::io::{AsyncRead, AsyncReadExt}; +use tokio_tar::Archive; use tracing::*; use walkdir::WalkDir; @@ -42,7 +43,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result { /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. -pub fn import_timeline_from_postgres_datadir( +pub async fn import_timeline_from_postgres_datadir( tline: &Timeline, pgdata_path: &Path, pgdata_lsn: Lsn, @@ -65,9 +66,11 @@ pub fn import_timeline_from_postgres_datadir( let absolute_path = entry.path(); let relative_path = absolute_path.strip_prefix(pgdata_path)?; - let file = File::open(absolute_path)?; + let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; - if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { + if let Some(control_file) = + import_file(&mut modification, relative_path, &mut file, len).await? + { pg_control = Some(control_file); } modification.flush()?; @@ -96,18 +99,19 @@ pub fn import_timeline_from_postgres_datadir( tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, - )?; + ) + .await?; Ok(()) } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_rel( - modification: &mut DatadirModification, +async fn import_rel( + modification: &mut DatadirModification<'_>, path: &Path, spcoid: Oid, dboid: Oid, - mut reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, ) -> anyhow::Result<()> { // Does it look like a relation file? 
@@ -148,7 +152,7 @@ fn import_rel( } loop { - let r = reader.read_exact(&mut buf); + let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; @@ -181,19 +185,19 @@ fn import_rel( /// Import an SLRU segment file /// -fn import_slru( - modification: &mut DatadirModification, +async fn import_slru( + modification: &mut DatadirModification<'_>, slru: SlruKind, path: &Path, - mut reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, -) -> Result<()> { - trace!("importing slru file {}", path.display()); +) -> anyhow::Result<()> { + info!("importing slru file {path:?}"); let mut buf: [u8; 8192] = [0u8; 8192]; let filename = &path .file_name() - .expect("missing slru filename") + .with_context(|| format!("missing slru filename for path {path:?}"))? .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; @@ -206,7 +210,7 @@ fn import_slru( let mut rpageno = 0; loop { - let r = reader.read_exact(&mut buf); + let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { modification.put_slru_page_image( @@ -237,14 +241,20 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
-fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { +async fn import_wal( + walpath: &Path, + tline: &Timeline, + startpoint: Lsn, + endpoint: Lsn, +) -> anyhow::Result<()> { + use std::io::Read; let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint)?; + let mut walingest = WalIngest::new(tline, startpoint).await?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -260,14 +270,15 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) } // Slurp the WAL file - let mut file = File::open(&path)?; + let mut file = std::fs::File::open(&path)?; if offset > 0 { - file.seek(SeekFrom::Start(offset as u64))?; + use std::io::Seek; + file.seek(std::io::SeekFrom::Start(offset as u64))?; } let nread = file.read_to_end(&mut buf)?; - if nread != WAL_SEGMENT_SIZE - offset as usize { + if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? error!("read only {} bytes from WAL file", nread); } @@ -279,7 +290,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .await?; last_lsn = lsn; nrecords += 1; @@ -303,9 +316,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) Ok(()) } -pub fn import_basebackup_from_tar( +pub async fn import_basebackup_from_tar( tline: &Timeline, - reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, ) -> Result<()> { info!("importing base at {base_lsn}"); @@ -315,21 +328,24 @@ pub fn import_basebackup_from_tar( let mut pg_control: Option = None; // Import base - for base_tar_entry in tar::Archive::new(reader).entries()? { - let entry = base_tar_entry?; + let mut entries = Archive::new(reader).entries()?; + while let Some(base_tar_entry) = entries.next().await { + let mut entry = base_tar_entry?; let header = entry.header(); let len = header.entry_size()? as usize; let file_path = header.path()?.into_owned(); match header.entry_type() { - tar::EntryType::Regular => { - if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? { + tokio_tar::EntryType::Regular => { + if let Some(res) = + import_file(&mut modification, file_path.as_ref(), &mut entry, len).await? + { // We found the pg_control file. 
pg_control = Some(res); } modification.flush()?; } - tar::EntryType::Directory => { + tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); } _ => { @@ -349,9 +365,9 @@ pub fn import_basebackup_from_tar( Ok(()) } -pub fn import_wal_from_tar( +pub async fn import_wal_from_tar( tline: &Timeline, - reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, ) -> Result<()> { @@ -360,20 +376,23 @@ pub fn import_wal_from_tar( let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn)?; + let mut walingest = WalIngest::new(tline, start_lsn).await?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); - let mut pg_wal_tar = tar::Archive::new(reader); - let mut pg_wal_entries_iter = pg_wal_tar.entries()?; + let mut pg_wal_tar = Archive::new(reader); + let mut pg_wal_entries = pg_wal_tar.entries()?; while last_lsn <= end_lsn { let bytes = { - let entry = pg_wal_entries_iter.next().expect("expected more wal")?; + let mut entry = pg_wal_entries + .next() + .await + .ok_or_else(|| anyhow::anyhow!("expected more wal"))??; let header = entry.header(); let file_path = header.path()?.into_owned(); match header.entry_type() { - tar::EntryType::Regular => { + tokio_tar::EntryType::Regular => { // FIXME: assume postgresql tli 1 for now let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let file_name = file_path @@ -383,9 +402,9 @@ pub fn import_wal_from_tar( ensure!(expected_filename == file_name); debug!("processing wal file {:?}", file_path); - read_all_bytes(entry)? + read_all_bytes(&mut entry).await? 
} - tar::EntryType::Directory => { + tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); continue; } @@ -405,7 +424,9 @@ pub fn import_wal_from_tar( let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); @@ -424,7 +445,7 @@ pub fn import_wal_from_tar( } // Log any extra unused files - for e in &mut pg_wal_entries_iter { + while let Some(e) = pg_wal_entries.next().await { let entry = e?; let header = entry.header(); let file_path = header.path()?.into_owned(); @@ -434,24 +455,30 @@ pub fn import_wal_from_tar( Ok(()) } -fn import_file( - modification: &mut DatadirModification, +async fn import_file( + modification: &mut DatadirModification<'_>, file_path: &Path, - reader: Reader, + reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, ) -> Result> { + let file_name = match file_path.file_name() { + Some(name) => name.to_string_lossy(), + None => return Ok(None), + }; + + if file_name.starts_with('.') { + // tar archives on macOs, created without COPYFILE_DISABLE=1 env var + // will contain "fork files", skip them. + return Ok(None); + } + if file_path.starts_with("global") { let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; - match file_path - .file_name() - .expect("missing filename") - .to_string_lossy() - .as_ref() - { + match file_name.as_ref() { "pg_control" => { - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; // Extract the checkpoint record and import it separately. 
let pg_control = ControlFileData::decode(&bytes[..])?; @@ -464,7 +491,7 @@ fn import_file( return Ok(Some(pg_control)); } "pg_filenode.map" => { - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; modification.put_relmap_file(spcnode, dbnode, bytes)?; debug!("imported relmap file") } @@ -472,7 +499,7 @@ fn import_file( debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; debug!("imported rel creation"); } } @@ -485,14 +512,9 @@ fn import_file( .to_string_lossy() .parse()?; - match file_path - .file_name() - .expect("missing base filename") - .to_string_lossy() - .as_ref() - { + match file_name.as_ref() { "pg_filenode.map" => { - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; modification.put_relmap_file(spcnode, dbnode, bytes)?; debug!("imported relmap file") } @@ -500,40 +522,36 @@ fn import_file( debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len)?; + import_slru(modification, slru, file_path, reader, len).await?; debug!("imported clog slru"); } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len)?; + import_slru(modification, slru, file_path, reader, len).await?; debug!("imported multixact offsets slru"); } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len)?; + import_slru(modification, slru, file_path, reader, len).await?; debug!("imported multixact members 
slru"); } else if file_path.starts_with("pg_twophase") { - let file_name = &file_path - .file_name() - .expect("missing twophase filename") - .to_string_lossy(); - let xid = u32::from_str_radix(file_name, 16)?; + let xid = u32::from_str_radix(file_name.as_ref(), 16)?; - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { debug!("found wal file in base section. ignore it"); } else if file_path.starts_with("zenith.signal") { // Parse zenith signal file to set correct previous LSN - let bytes = read_all_bytes(reader)?; + let bytes = read_all_bytes(reader).await?; // zenith.signal format is "PREV LSN: prev_lsn" // TODO write serialization and deserialization in the same place. let zenith_signal = std::str::from_utf8(&bytes)?.trim(); @@ -570,8 +588,8 @@ fn import_file( Ok(None) } -fn read_all_bytes(mut reader: Reader) -> Result { +async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result { let mut buf: Vec = vec![]; - reader.read_to_end(&mut buf)?; + reader.read_to_end(&mut buf).await?; Ok(Bytes::copy_from_slice(&buf[..])) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index eafcaa88d9..91cde477ad 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,6 +1,7 @@ mod auth; pub mod basebackup; pub mod config; +pub mod consumption_metrics; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -8,15 +9,9 @@ pub(crate) mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; -pub mod profiling; pub mod repository; -pub mod storage_sync2; -pub use storage_sync2 as storage_sync; pub mod task_mgr; pub mod tenant; -pub mod tenant_config; -pub mod tenant_mgr; -pub mod tenant_tasks; pub mod trace; pub mod virtual_file; pub mod walingest; @@ -26,9 +21,8 @@ pub mod walredo; use std::path::Path; -use tracing::info; - use 
crate::task_mgr::TaskKind; +use tracing::info; /// Current storage format version /// @@ -47,15 +41,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); -/// Config for the Repository checkpointer -#[derive(Debug, Clone, Copy)] -pub enum CheckpointConfig { - // Flush all in-memory data - Flush, - // Flush all in-memory data and reconstruct all page images - Forced, -} - pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -66,7 +51,7 @@ pub async fn shutdown_pageserver(exit_code: i32) { // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. - tenant_mgr::shutdown_all_tenants().await; + tenant::mgr::shutdown_all_tenants().await; // Stop syncing with remote storage. // @@ -99,7 +84,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { } } -fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 } else { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 454ff01f0e..b61e64048b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,13 +84,20 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -// Metrics for determining timeline's physical size. -// A layered timeline's physical is defined as the total size of -// (delta/image) layer files on disk. 
-static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { +static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "pageserver_current_physical_size", - "Current physical size grouped by timeline", + "pageserver_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem.", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_remote_physical_size", + "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.", + // Corollary: If any files are missing from the index part, they won't be included here. &["tenant_id", "timeline_id"] ) .expect("failed to define a metric") @@ -136,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 1.0, // 1 sec ]; -const STORAGE_IO_TIME_OPERATIONS: &[&str] = - &["open", "close", "read", "write", "seek", "fsync", "gc"]; +const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ + "open", "close", "read", "write", "seek", "fsync", "gc", "metadata", +]; const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; @@ -201,23 +209,42 @@ pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { // remote storage metrics -pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(|| { +/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. +static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy::new(|| { register_int_gauge_vec!( - "pageserver_remote_upload_queue_unfinished_tasks", - "Number of tasks in the upload queue that are not finished yet.", + "pageserver_remote_timeline_client_calls_unfinished", + "Number of ongoing calls to remote timeline client. \ + Used to populate pageserver_remote_timeline_client_calls_started. 
\ + This metric is not useful for sampling from Prometheus, but useful in tests.", &["tenant_id", "timeline_id", "file_kind", "op_kind"], ) .expect("failed to define a metric") }); -#[derive(Debug, Clone, Copy)] +static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_remote_timeline_client_calls_started", + "When calling a remote timeline client method, we record the current value \ + of the calls_unfinished gauge in this histogram. Plot the histogram \ + over time in a heatmap to visualize how many operations were ongoing \ + at a given instant. It gives you a better idea of the queue depth \ + than plotting the gauge directly, since operations may complete faster \ + than the sampling interval.", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + // The calls_unfinished gauge is an integer gauge, hence we have integer buckets. + vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0], + ) + .expect("failed to define a metric") +}); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, Download, Delete, } impl RemoteOpKind { - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'static str { match self { Self::Upload => "upload", Self::Download => "download", @@ -226,13 +253,13 @@ impl RemoteOpKind { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub enum RemoteOpFileKind { Layer, Index, } impl RemoteOpFileKind { - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'static str { match self { Self::Layer => "layer", Self::Index => "index", @@ -240,15 +267,12 @@ impl RemoteOpFileKind { } } -pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"]; -pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"]; -pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"]; - pub static REMOTE_OPERATION_TIME: Lazy = Lazy::new(|| { 
register_histogram_vec!( "pageserver_remote_operation_seconds", "Time spent on remote storage operations. \ - Grouped by tenant, timeline, operation_kind and status", + Grouped by tenant, timeline, operation_kind and status. \ + Does not account for time spent waiting in remote timeline client's queues.", &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"] ) .expect("failed to define a metric") @@ -365,7 +389,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: Histogram, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, - pub current_physical_size_gauge: UIntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -406,7 +430,7 @@ impl TimelineMetrics { let wait_lsn_time_histo = WAIT_LSN_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); let current_logical_size_gauge = CURRENT_LOGICAL_SIZE @@ -432,7 +456,7 @@ impl TimelineMetrics { load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, - current_physical_size_gauge, + resident_physical_size_gauge, current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, @@ -448,7 +472,7 @@ impl Drop for TimelineMetrics { let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); - let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = 
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); @@ -467,21 +491,6 @@ impl Drop for TimelineMetrics { for op in SMGR_QUERY_TIME_OPERATIONS { let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); } - - let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]); - for file_kind in REMOTE_OPERATION_FILE_KINDS { - for op in REMOTE_OPERATION_KINDS { - for status in REMOTE_OPERATION_STATUSES { - let _ = REMOTE_OPERATION_TIME.remove_label_values(&[ - tenant_id, - timeline_id, - file_kind, - op, - status, - ]); - } - } - } } } @@ -491,10 +500,198 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) { use futures::Future; use pin_project_lite::pin_project; +use std::collections::HashMap; use std::pin::Pin; +use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::Instant; +pub struct RemoteTimelineClientMetrics { + tenant_id: String, + timeline_id: String, + remote_physical_size_gauge: Mutex>, + remote_operation_time: Mutex>, + calls_unfinished_gauge: Mutex>, + calls_started_hist: Mutex>, +} + +impl RemoteTimelineClientMetrics { + pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + RemoteTimelineClientMetrics { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + remote_operation_time: Mutex::new(HashMap::default()), + calls_unfinished_gauge: Mutex::new(HashMap::default()), + calls_started_hist: Mutex::new(HashMap::default()), + remote_physical_size_gauge: Mutex::new(None), + } + } + pub fn remote_physical_size_gauge(&self) -> UIntGauge { + let mut guard = self.remote_physical_size_gauge.lock().unwrap(); + guard + .get_or_insert_with(|| { + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + ]) + .unwrap() + }) + .clone() + } + pub fn remote_operation_time( + &self, + file_kind: 
&RemoteOpFileKind, + op_kind: &RemoteOpKind, + status: &'static str, + ) -> Histogram { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.remote_operation_time.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str(), status); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_OPERATION_TIME + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + key.2, + ]) + .unwrap() + }); + metric.clone() + } + fn calls_unfinished_gauge( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntGauge { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.calls_unfinished_gauge.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } + + fn calls_started_hist( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Histogram { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.calls_started_hist.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } +} + +/// See [`RemoteTimelineClientMetrics::call_begin`]. +#[must_use] +pub(crate) struct RemoteTimelineClientCallMetricGuard(Option); + +impl RemoteTimelineClientCallMetricGuard { + /// Consume this guard object without decrementing the metric. + /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out. 
+ pub fn will_decrement_manually(mut self) { + self.0 = None; // prevent drop() from decrementing + } +} + +impl Drop for RemoteTimelineClientCallMetricGuard { + fn drop(&mut self) { + if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self { + guard.dec(); + } + } +} + +impl RemoteTimelineClientMetrics { + /// Increment the metrics that track ongoing calls to the remote timeline client instance. + /// + /// Drop the returned guard object once the operation is finished to decrement the values. + /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that + /// is more suitable. + /// Never do both. + pub(crate) fn call_begin( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> RemoteTimelineClientCallMetricGuard { + let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + self.calls_started_hist(file_kind, op_kind) + .observe(unfinished_metric.get() as f64); + unfinished_metric.inc(); + RemoteTimelineClientCallMetricGuard(Some(unfinished_metric)) + } + + /// Manually decrement the metric instead of using the guard object. + /// Using the guard object is generally preferable. + /// See [`call_begin`] for more context. 
+ pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) { + let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + debug_assert!( + unfinished_metric.get() > 0, + "begin and end should cancel out" + ); + unfinished_metric.dec(); + } +} + +impl Drop for RemoteTimelineClientMetrics { + fn drop(&mut self) { + let RemoteTimelineClientMetrics { + tenant_id, + timeline_id, + remote_physical_size_gauge, + remote_operation_time, + calls_unfinished_gauge, + calls_started_hist, + } = self; + for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { + let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); + } + for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + { + let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above + let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + } + } +} + /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. pub trait MeasureRemoteOp: Sized { @@ -504,6 +701,7 @@ pub trait MeasureRemoteOp: Sized { timeline_id: TimelineId, file_kind: RemoteOpFileKind, op: RemoteOpKind, + metrics: Arc, ) -> MeasuredRemoteOp { let start = Instant::now(); MeasuredRemoteOp { @@ -513,6 +711,7 @@ pub trait MeasureRemoteOp: Sized { file_kind, op, start, + metrics, } } } @@ -529,6 +728,7 @@ pin_project! 
{ file_kind: RemoteOpFileKind, op: RemoteOpKind, start: Instant, + metrics: Arc, } } @@ -541,15 +741,8 @@ impl>, O, E> Future for MeasuredRemoteOp { if let Poll::Ready(ref res) = poll_result { let duration = this.start.elapsed(); let status = if res.is_ok() { &"success" } else { &"failure" }; - REMOTE_OPERATION_TIME - .get_metric_with_label_values(&[ - &this.tenant_id.to_string(), - &this.timeline_id.to_string(), - this.file_kind.as_str(), - this.op.as_str(), - status, - ]) - .unwrap() + this.metrics + .remote_operation_time(this.file_kind, this.op, status) .observe(duration.as_secs_f64()); } poll_result diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 036fb14e9b..b266a07337 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -9,7 +9,7 @@ // custom protocol. // -use anyhow::{bail, ensure, Context, Result}; +use anyhow::Context; use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; @@ -19,6 +19,8 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; +use pq_proto::ConnectionError; +use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; @@ -26,11 +28,9 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use tokio::pin; -use tokio_util::io::StreamReader; -use tokio_util::io::SyncIoBridge; use tracing::*; use utils::id::ConnectionId; +use utils::postgres_backend_async::QueryError; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, @@ -42,16 +42,14 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; -use crate::config::{PageServerConf, ProfilingConfig}; +use crate::config::PageServerConf; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; -use crate::profiling::profpoint_start; use 
crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::mgr; use crate::tenant::{Tenant, Timeline}; -use crate::tenant_mgr; use crate::trace::Tracer; -use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -65,8 +63,8 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { // We were requested to shut down. let msg = format!("pageserver is shutting down"); - let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg)); - Err(anyhow::anyhow!(msg)) + let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None)); + Err(QueryError::Other(anyhow::anyhow!(msg))) } msg = pgb.read_message() => { msg } @@ -79,14 +77,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { break }, FeMessage::Sync => continue, FeMessage::Terminate => { - let msg = format!("client terminated connection with Terminate message during COPY"); - pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + let msg = "client terminated connection with Terminate message during COPY"; + let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; break; } m => { - let msg = format!("unexpected message {:?}", m); - pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + let msg = format!("unexpected message {m:?}"); + pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?; Err(io::Error::new(io::ErrorKind::Other, msg))?; break; } @@ -96,12 +95,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { let msg = "client closed connection during COPY"; - pgb.write_message(&BeMessage::ErrorResponse(msg))?; + let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + pgb.write_message(&BeMessage::ErrorResponse(msg, 
Some(query_error_error.pg_error_code())))?; pgb.flush().await?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; } - Err(e) => { - Err(io::Error::new(io::ErrorKind::Other, e))?; + Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { + Err(io_error)?; + } + Err(other) => { + Err(io::Error::new(io::ErrorKind::Other, other))?; } }; } @@ -199,23 +202,19 @@ async fn page_service_conn_main( // we've been requested to shut down Ok(()) } - Err(err) => { - let root_cause_io_err_kind = err - .root_cause() - .downcast_ref::() - .map(|e| e.kind()); - + Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { // `ConnectionReset` error happens when the Postgres client closes the connection. // As this disconnection happens quite often and is expected, // we decided to downgrade the logging level to `INFO`. // See: https://github.com/neondatabase/neon/issues/1683. - if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) { + if io_error.kind() == io::ErrorKind::ConnectionReset { info!("Postgres client disconnected"); Ok(()) } else { - Err(err) + Err(io_error).context("Postgres connection error") } } + other => other.context("Postgres query error"), } } @@ -254,7 +253,7 @@ impl PageRequestMetrics { #[derive(Debug)] struct PageServerHandler { - conf: &'static PageServerConf, + _conf: &'static PageServerConf, auth: Option>, claims: Option, } @@ -262,7 +261,7 @@ struct PageServerHandler { impl PageServerHandler { pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { PageServerHandler { - conf, + _conf: conf, auth, claims: None, } @@ -317,7 +316,7 @@ impl PageServerHandler { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, Some(m) => { - bail!("unexpected message: {m:?} during COPY"); + anyhow::bail!("unexpected message: {m:?} during COPY"); } None => break, // client disconnected }; @@ -374,7 +373,7 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, - ) -> 
anyhow::Result<()> { + ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); @@ -396,9 +395,7 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_stream = copyin_stream(pgb); - pin!(copyin_stream); - + let mut copyin_stream = Box::pin(copyin_stream(pgb)); timeline .import_basebackup_from_tar(&mut copyin_stream, base_lsn) .await?; @@ -430,11 +427,16 @@ impl PageServerHandler { timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; - ensure!(timeline.get_last_record_lsn() == start_lsn); + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn != start_lsn { + return Err(QueryError::Other( + anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) + ); + } // TODO leave clean state on error. For now you can use detach to clean // up broken state from a failed import. @@ -444,10 +446,8 @@ impl PageServerHandler { pgb.write_message(&BeMessage::CopyInResponse)?; pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); - let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - tokio::task::block_in_place(|| { - import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) - })?; + let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); + import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?; info!("wal import complete"); // Drain the rest of the Copy data @@ -460,13 +460,17 @@ impl PageServerHandler { } // TODO Does it make sense to overshoot? 
- ensure!(timeline.get_last_record_lsn() >= end_lsn); + if timeline.get_last_record_lsn() < end_lsn { + return Err(QueryError::Other( + anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) + ); + } // Flush data to disk, then upload to s3. No need for a forced checkpoint. // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. info!("flushing layers"); - timeline.checkpoint(CheckpointConfig::Flush).await?; + timeline.freeze_and_flush().await?; info!("done"); Ok(()) @@ -489,7 +493,7 @@ impl PageServerHandler { mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, - ) -> Result { + ) -> anyhow::Result { if latest { // Latest page version was requested. If LSN is given, it is a hint // to the page server that there have been no modifications to the @@ -520,11 +524,11 @@ impl PageServerHandler { } } else { if lsn == Lsn(0) { - bail!("invalid LSN(0) in request"); + anyhow::bail!("invalid LSN(0) in request"); } timeline.wait_lsn(lsn).await?; } - ensure!( + anyhow::ensure!( lsn >= **latest_gc_cutoff_lsn, "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", lsn, **latest_gc_cutoff_lsn @@ -537,12 +541,15 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamExistsRequest, - ) -> Result { + ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; + let exists = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_exists(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, @@ -554,12 +561,15 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamNblocksRequest, - ) -> Result { + ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; + let n_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_size(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -571,14 +581,15 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, - ) -> Result { + ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let total_blocks = - timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; - + let total_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + }) + .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { @@ -591,7 +602,7 @@ impl PageServerHandler { &self, timeline: &Timeline, req: &PagestreamGetPageRequest, - ) -> Result { + 
) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; @@ -604,11 +615,10 @@ impl PageServerHandler { } */ - // FIXME: this profiling now happens at different place than it used to. The - // current profiling is based on a thread-local variable, so it doesn't work - // across awaits - let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; + let page = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, @@ -642,16 +652,12 @@ impl PageServerHandler { pgb.flush().await?; /* Send a tarball of the latest layer on the timeline */ - let mut writer = CopyDataSink { - pgb, - rt: tokio::runtime::Handle::current(), - }; - tokio::task::block_in_place(|| { - let basebackup = - basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); - basebackup.send_tarball() - })?; + { + let mut writer = pgb.copyout_writer(); + basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup) + .await?; + } + pgb.write_message(&BeMessage::CopyDone)?; pgb.flush().await?; info!("basebackup complete"); @@ -661,7 +667,7 @@ impl PageServerHandler { // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); @@ -683,20 +689,19 @@ impl postgres_backend_async::Handler for PageServerHandler { &mut self, _pgb: 
&mut PostgresBackend, jwt_response: &[u8], - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self .auth .as_ref() .unwrap() - .decode(str::from_utf8(jwt_response)?)?; + .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; - if matches!(data.claims.scope, Scope::Tenant) { - ensure!( - data.claims.tenant_id.is_some(), + if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { + return Err(QueryError::Other(anyhow::anyhow!( "jwt token scope is Tenant, but tenant id is missing" - ) + ))); } info!( @@ -708,22 +713,33 @@ impl postgres_backend_async::Handler for PageServerHandler { Ok(()) } + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + _sm: &FeStartupPacket, + ) -> Result<(), QueryError> { + Ok(()) + } + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> anyhow::Result<()> { - debug!("process query {:?}", query_string); + ) -> Result<(), QueryError> { + debug!("process query {query_string:?}"); if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); - ensure!( - params.len() == 2, - "invalid param number for pagestream command" - ); - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; @@ -733,18 +749,24 @@ impl postgres_backend_async::Handler for 
PageServerHandler { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!( - params.len() >= 2, - "invalid param number for basebackup command" - ); + if params.len() < 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for basebackup command" + ))); + } - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; let lsn = if params.len() == 3 { - Some(Lsn::from_str(params[2])?) + Some( + Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + ) } else { None }; @@ -759,13 +781,16 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!( - params.len() == 2, - "invalid param number for get_last_record_rlsn command" - ); + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for get_last_record_rlsn command" + ))); + } - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; @@ -787,22 +812,31 @@ impl postgres_backend_async::Handler for PageServerHandler { let (_, params_raw) = 
query_string.split_at("fullbackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!( - params.len() >= 2, - "invalid param number for fullbackup command" - ); + if params.len() < 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for fullbackup command" + ))); + } - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { - Some(Lsn::from_str(params[2])?) + Some( + Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + ) } else { None }; let prev_lsn = if params.len() > 3 { - Some(Lsn::from_str(params[3])?) 
+ Some( + Lsn::from_str(params[3]) + .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?, + ) } else { None }; @@ -827,12 +861,21 @@ impl postgres_backend_async::Handler for PageServerHandler { // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 5); - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; - let base_lsn = Lsn::from_str(params[2])?; - let end_lsn = Lsn::from_str(params[3])?; - let pg_version = u32::from_str(params[4])?; + if params.len() != 5 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for import basebackup command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + let base_lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + let end_lsn = Lsn::from_str(params[3]) + .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; + let pg_version = u32::from_str(params[4]) + .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; self.check_permission(Some(tenant_id))?; @@ -850,7 +893,10 @@ impl postgres_backend_async::Handler for PageServerHandler { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? 
} }; } else if query_string.starts_with("import wal ") { @@ -860,11 +906,19 @@ impl postgres_backend_async::Handler for PageServerHandler { // caller should poll the http api to check when that is done. let (_, params_raw) = query_string.split_at("import wal ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 4); - let tenant_id = TenantId::from_str(params[0])?; - let timeline_id = TimelineId::from_str(params[1])?; - let start_lsn = Lsn::from_str(params[2])?; - let end_lsn = Lsn::from_str(params[3])?; + if params.len() != 4 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for import wal command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + let start_lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + let end_lsn = Lsn::from_str(params[3]) + .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; self.check_permission(Some(tenant_id))?; @@ -875,7 +929,10 @@ impl postgres_backend_async::Handler for PageServerHandler { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? + pgb.write_message(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? 
} }; } else if query_string.to_ascii_lowercase().starts_with("set ") { @@ -886,8 +943,13 @@ impl postgres_backend_async::Handler for PageServerHandler { // show let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); - ensure!(params.len() == 1, "invalid param number for config command"); - let tenant_id = TenantId::from_str(params[0])?; + if params.len() != 1 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for config command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; self.check_permission(Some(tenant_id))?; @@ -928,7 +990,9 @@ impl postgres_backend_async::Handler for PageServerHandler { ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { - bail!("unknown command"); + return Err(QueryError::Other(anyhow::anyhow!( + "unknown command {query_string}" + ))); } Ok(()) @@ -940,8 +1004,8 @@ impl postgres_backend_async::Handler for PageServerHandler { /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. 
-async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result> { - let tenant = tenant_mgr::get_tenant(tenant_id, false).await?; +async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result> { + let tenant = mgr::get_tenant(tenant_id, false).await?; match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { Ok(wait_result) => wait_result // no .context(), the error message is good enough and some tests depend on it @@ -954,37 +1018,8 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result Result> { +) -> anyhow::Result> { get_active_tenant_with_timeout(tenant_id) .await .and_then(|tenant| tenant.get_timeline(timeline_id, true)) } - -/// -/// A std::io::Write implementation that wraps all data written to it in CopyData -/// messages. -/// -struct CopyDataSink<'a> { - pgb: &'a mut PostgresBackend, - rt: tokio::runtime::Handle, -} - -impl<'a> io::Write for CopyDataSink<'a> { - fn write(&mut self, data: &[u8]) -> io::Result { - // CopyData - // FIXME: if the input is large, we should split it into multiple messages. - // Not sure what the threshold should be, but the ultimate hard limit is that - // the length cannot exceed u32. - // FIXME: flush isn't really required, but makes it easier - // to view in wireshark - self.pgb.write_message(&BeMessage::CopyData(data))?; - self.rt.block_on(self.pgb.flush())?; - trace!("CopyData sent for {} bytes!", data.len()); - - Ok(data.len()) - } - fn flush(&mut self) -> io::Result<()> { - // no-op - Ok(()) - } -} diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0e334a63df..82b1576145 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,11 +6,12 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! 
+use super::tenant::PageReconstructResult; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::repository::*; -use crate::tenant::Timeline; +use crate::tenant::{with_ondemand_download, Timeline}; use crate::walrecord::NeonWalRecord; -use anyhow::{bail, ensure, Result}; +use crate::{repository::*, try_no_ondemand_download}; +use anyhow::Context; use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -19,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::Range; +use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -33,6 +35,14 @@ pub enum LsnForTimestamp { NoData(Lsn), } +#[derive(Debug, thiserror::Error)] +pub enum CalculateLogicalSizeError { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + /// /// This impl provides all the functionality to store PostgreSQL relations, SLRUs, /// and other special kinds of files, in a versioned key-value store. 
The @@ -88,16 +98,18 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, - ) -> Result { - ensure!(tag.relnode != 0, "invalid relnode"); + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } - let nblocks = self.get_rel_size(tag, lsn, latest)?; + let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest)); if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, lsn, nblocks ); - return Ok(ZERO_PAGE.clone()); + return PageReconstructResult::Success(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); @@ -105,38 +117,51 @@ impl Timeline { } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result { + pub fn get_db_size( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + latest: bool, + ) -> PageReconstructResult { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn)?; + let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn)); for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest)?; + let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest)); total_blocks += n_blocks as usize; } - Ok(total_blocks) + PageReconstructResult::Success(total_blocks) } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_size( + &self, + tag: RelTag, + lsn: Lsn, + latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(nblocks); + return PageReconstructResult::Success(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, 
latest)? + && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest)) { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. Tolerate that by claiming that // any non-existent FSM fork has size 0. - return Ok(0); + return PageReconstructResult::Success(0); } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn)?; + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); let nblocks = buf.get_u32_le(); if latest { @@ -149,43 +174,62 @@ impl Timeline { // associated with most recent value of LSN. self.update_cached_rel_size(tag, lsn, nblocks); } - Ok(nblocks) + PageReconstructResult::Success(nblocks) } /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_exists( + &self, + tag: RelTag, + lsn: Lsn, + _latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } // first try to lookup relation in cache if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(true); + return PageReconstructResult::Success(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); - - Ok(exists) + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Get a list of all existing relations in given tablespace and database. 
- pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + pub fn list_rels( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> PageReconstructResult> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) + PageReconstructResult::Success(rels) + } + Err(e) => PageReconstructResult::from(e), + } } /// Look up given SLRU page version. @@ -195,7 +239,7 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, - ) -> Result { + ) -> PageReconstructResult { let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn) } @@ -206,21 +250,30 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> Result { + ) -> PageReconstructResult { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn)?; - Ok(buf.get_u32_le()) + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf.get_u32_le()) } /// Get size of an SLRU segment - pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + pub fn get_slru_segment_exists( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> PageReconstructResult { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.segments.get(&segno).is_some(); - Ok(exists) + match 
SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.segments.get(&segno).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Locate LSN, such that all transactions that committed before @@ -230,7 +283,10 @@ impl Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + pub fn find_lsn_for_timestamp( + &self, + search_timestamp: TimestampTz, + ) -> PageReconstructResult { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -246,12 +302,12 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self.is_latest_commit_timestamp_ge_than( + let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), &mut found_smaller, &mut found_larger, - )?; + )); if cmp { high = mid; @@ -263,15 +319,15 @@ impl Timeline { (false, false) => { // This can happen if no commit records have been processed yet, e.g. // just after importing a cluster. 
- Ok(LsnForTimestamp::NoData(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn)) } (true, false) => { // Didn't find any commit timestamps larger than the request - Ok(LsnForTimestamp::Future(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn)) } (false, true) => { // Didn't find any commit timestamps smaller than the request - Ok(LsnForTimestamp::Past(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn)) } (true, true) => { // low is the LSN of the first commit record *after* the search_timestamp, @@ -281,7 +337,7 @@ impl Timeline { // Otherwise, if you restore to the returned LSN, the database will // include physical changes from later commits that will be marked // as aborted, and will need to be vacuumed away. - Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8))) } } } @@ -299,12 +355,20 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, - ) -> Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? 
{ - let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + ) -> PageReconstructResult { + for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) { + let nblocks = try_no_ondemand_download!(self.get_slru_segment_size( + SlruKind::Clog, + segno, + probe_lsn + )); for blknum in (0..nblocks).rev() { - let clog_page = - self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn( + SlruKind::Clog, + segno, + blknum, + probe_lsn + )); if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; @@ -313,61 +377,75 @@ impl Timeline { if timestamp >= search_timestamp { *found_larger = true; - return Ok(true); + return PageReconstructResult::Success(true); } else { *found_smaller = true; } } } } - Ok(false) + PageReconstructResult::Success(false) } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + pub fn list_slru_segments( + &self, + kind: SlruKind, + lsn: Lsn, + ) -> PageReconstructResult> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; - - Ok(dir.segments) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.segments), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_relmap_file( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> PageReconstructResult { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> 
PageReconstructResult> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn)?; - let dir = DbDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn)); - Ok(dir.dbdirs) + match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.dbdirs), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult { let key = twophase_file_key(xid); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let dir = TwoPhaseDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn)); - Ok(dir.xids) + match TwoPhaseDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.xids), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_control_file(&self, lsn: Lsn) -> Result { + pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult { self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult { self.get(CHECKPOINT_KEY, lsn) } @@ -376,16 +454,26 @@ impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. 
- pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + pub async fn get_current_logical_size_non_incremental( + &self, + lsn: Lsn, + cancel: CancellationToken, + ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; + let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn)? { + for rel in + crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn)) + .await? + { + if cancel.is_cancelled() { + return Err(CalculateLogicalSizeError::Cancelled); + } let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -398,7 +486,7 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). 
- pub fn collect_keyspace(&self, lsn: Lsn) -> Result { + pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -406,8 +494,8 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; + let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); @@ -415,15 +503,15 @@ impl Timeline { result.add_key(relmap_file_key(spcnode, dbnode)); result.add_key(rel_dir_to_key(spcnode, dbnode)); - let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn)? - .iter() - .cloned() - .collect(); + let mut rels: Vec = + with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn)) + .await? + .into_iter() + .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -439,13 +527,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = self.get_download(slrudir_key, lsn).await?; + let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn)?; + let mut buf = self.get_download(segsize_key, lsn).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -457,8 +545,8 @@ impl Timeline { // 
Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?; + let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); for xid in xids { @@ -537,7 +625,7 @@ impl<'a> DatadirModification<'a> { /// /// This inserts the directory metadata entries that are assumed to /// always exist. - pub fn init_empty(&mut self) -> Result<()> { + pub fn init_empty(&mut self) -> anyhow::Result<()> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; @@ -570,8 +658,8 @@ impl<'a> DatadirModification<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -583,7 +671,7 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { + ) -> anyhow::Result<()> { self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -597,8 +685,8 @@ impl<'a> DatadirModification<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -609,26 +697,26 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { + ) -> anyhow::Result<()> { self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); Ok(()) } /// Store a relmapper file (pg_filenode.map) in the repository - pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> { + pub fn 
put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); - if r == None || r == Some(false) { + if r.is_none() || r == Some(false) { // The dbdir entry didn't exist, or it contained a // 'false'. The 'insert' call already updated it with // 'true', now write the updated 'dbdirs' map back. let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); } - if r == None { + if r.is_none() { // Create RelDirectory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), @@ -643,12 +731,12 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> { + pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { - bail!("twophase file for xid {} already exists", xid); + anyhow::bail!("twophase file for xid {} already exists", xid); } self.put( TWOPHASEDIR_KEY, @@ -659,23 +747,26 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn put_control_file(&mut self, img: Bytes) -> Result<()> { + pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } - pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> { + pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } - pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { let req_lsn = 
self.tline.get_last_record_lsn(); - let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; + let total_blocks = self + .tline + .get_db_size(spcnode, dbnode, req_lsn, true) + .no_ondemand_download()?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -698,11 +789,11 @@ impl<'a> DatadirModification<'a> { /// Create a relation fork. /// /// 'nblocks' is the initial size. - pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -714,12 +805,12 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key)?)? + RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)? 
}; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - bail!("rel {} already exists", rel); + anyhow::bail!("rel {rel} already exists"); } self.put( rel_dir_key, @@ -742,13 +833,17 @@ impl<'a> DatadirModification<'a> { } /// Truncate relation - pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true)? { + if self + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -768,12 +863,12 @@ impl<'a> DatadirModification<'a> { /// Extend relation /// If new size is smaller, do nothing. - pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -789,12 +884,12 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. 
- pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -805,7 +900,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -822,14 +917,14 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - bail!("slru segment {:?}/{} already exists", kind, segno); + anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } self.put( dir_key, @@ -852,7 +947,7 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); @@ -861,10 +956,10 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> { + pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = 
self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -882,15 +977,15 @@ impl<'a> DatadirModification<'a> { } /// Drop a relmapper file (pg_filenode.map) - pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> { + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> { // TODO Ok(()) } /// This method is used for marking truncated SLRU files - pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> { + pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -925,7 +1020,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> Result<()> { + pub fn flush(&mut self) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -936,7 +1031,7 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer(); // Flush relation and SLRU data blocks, keep metadata. 
- let mut result: Result<()> = Ok(()); + let mut result: anyhow::Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { result = writer.put(key, self.lsn, value); @@ -984,7 +1079,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - fn get(&self, key: Key) -> Result { + fn get(&self, key: Key) -> PageReconstructResult { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -992,14 +1087,14 @@ impl<'a> DatadirModification<'a> { // value that has been removed, deletion only avoids leaking storage. if let Some(value) = self.pending_updates.get(&key) { if let Value::Image(img) = value { - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { // Currently, we never need to read back a WAL record that we // inserted in the same "transaction". All the metadata updates // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. - bail!("unexpected pending WAL record"); + PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record")) } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); @@ -1327,7 +1422,7 @@ fn twophase_key_range(xid: TransactionId) -> Range { field2: 0, field3: 0, field4: 0, - field5: if overflowed { 1 } else { 0 }, + field5: u8::from(overflowed), field6: next_xid, } } @@ -1354,7 +1449,7 @@ const CHECKPOINT_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. 
-pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { +pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( RelTag { @@ -1365,7 +1460,7 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { }, key.field6, ), - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } @@ -1384,21 +1479,21 @@ pub fn is_rel_vm_block_key(key: Key) -> bool { && key.field6 != 0xffffffff } -pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { +pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { let kind = match key.field2 { 0x00 => SlruKind::Clog, 0x01 => SlruKind::MultiXactMembers, 0x02 => SlruKind::MultiXactOffsets, - _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), }; let segno = key.field4; let blknum = key.field6; (kind, segno, blknum) } - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } @@ -1413,7 +1508,7 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, -) -> Result> { +) -> anyhow::Result> { let tline = tenant .create_empty_timeline(timeline_id, Lsn(8), pg_version)? .initialize()?; diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs deleted file mode 100644 index ad896cfa30..0000000000 --- a/pageserver/src/profiling.rs +++ /dev/null @@ -1,107 +0,0 @@ -//! -//! Support for profiling -//! -//! This relies on a modified version of the 'pprof-rs' crate. That's not very -//! nice, so to avoid a hard dependency on that, this is an optional feature. -//! -use crate::config::{PageServerConf, ProfilingConfig}; - -/// The actual implementation is in the `profiling_impl` submodule. 
If the profiling -/// feature is not enabled, it's just a dummy implementation that panics if you -/// try to enabled profiling in the configuration. -pub use profiling_impl::*; - -#[cfg(feature = "profiling")] -mod profiling_impl { - use super::*; - use pprof; - use std::marker::PhantomData; - - /// Start profiling the current thread. Returns a guard object; - /// the profiling continues until the guard is dropped. - /// - /// Note: profiling is not re-entrant. If you call 'profpoint_start' while - /// profiling is already started, nothing happens, and the profiling will be - /// stopped when either guard object is dropped. - #[inline] - pub fn profpoint_start( - conf: &crate::config::PageServerConf, - point: ProfilingConfig, - ) -> Option { - if conf.profiling == point { - pprof::start_profiling(); - Some(ProfilingGuard(PhantomData)) - } else { - None - } - } - - /// A hack to remove Send and Sync from the ProfilingGuard. Because the - /// profiling is attached to current thread. - //// - /// See comments in https://github.com/rust-lang/rust/issues/68318 - type PhantomUnsend = std::marker::PhantomData<*mut u8>; - - pub struct ProfilingGuard(PhantomUnsend); - - impl Drop for ProfilingGuard { - fn drop(&mut self) { - pprof::stop_profiling(); - } - } - - /// Initialize the profiler. This must be called before any 'profpoint_start' calls. - pub fn init_profiler(conf: &PageServerConf) -> Option { - if conf.profiling != ProfilingConfig::Disabled { - Some(pprof::ProfilerGuardBuilder::default().build().unwrap()) - } else { - None - } - } - - /// Exit the profiler. Writes the flamegraph to current workdir. 
- pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option) { - // Write out the flamegraph - if let Some(profiler_guard) = profiler_guard { - if let Ok(report) = profiler_guard.report().build() { - // this gets written under the workdir - let file = std::fs::File::create("flamegraph.svg").unwrap(); - let mut options = pprof::flamegraph::Options::default(); - options.image_width = Some(2500); - report.flamegraph_with_options(file, &mut options).unwrap(); - } - } - } -} - -/// Dummy implementation when compiling without profiling feature or for non-linux OSes. -#[cfg(not(feature = "profiling"))] -mod profiling_impl { - use super::*; - - pub struct DummyProfilerGuard; - - impl Drop for DummyProfilerGuard { - fn drop(&mut self) { - // do nothing, this exists to calm Clippy down - } - } - - pub fn profpoint_start( - _conf: &PageServerConf, - _point: ProfilingConfig, - ) -> Option { - None - } - - pub fn init_profiler(conf: &PageServerConf) -> Option { - if conf.profiling != ProfilingConfig::Disabled { - // shouldn't happen, we don't allow profiling in the config if the support - // for it is disabled. - panic!("profiling enabled but the binary was compiled without profiling support"); - } - None - } - - pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option) {} -} diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs deleted file mode 100644 index 0d25d88a97..0000000000 --- a/pageserver/src/storage_sync2/download.rs +++ /dev/null @@ -1,232 +0,0 @@ -//! 
Helper functions to download files from remote storage with a RemoteStorage -use std::collections::HashSet; -use std::path::Path; - -use anyhow::{bail, Context}; -use futures::stream::{FuturesUnordered, StreamExt}; -use tokio::fs; -use tokio::io::AsyncWriteExt; -use tracing::{debug, info_span, Instrument}; - -use crate::config::PageServerConf; -use crate::storage_sync::index::LayerFileMetadata; -use crate::tenant::filename::LayerFileName; -use remote_storage::{DownloadError, GenericRemoteStorage}; -use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; - -use super::index::{IndexPart, IndexPartUnclean}; - -async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { - fs::File::open(path).await?.sync_all().await -} - -/// -/// If 'metadata' is given, we will validate that the downloaded file's size matches that -/// in the metadata. (In the future, we might do more cross-checks, like CRC validation) -/// -/// Returns the size of the downloaded file. -pub async fn download_layer_file<'a>( - conf: &'static PageServerConf, - storage: &'a GenericRemoteStorage, - tenant_id: TenantId, - timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, - layer_metadata: &'a LayerFileMetadata, -) -> anyhow::Result { - let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); - - let local_path = timeline_path.join(layer_file_name.file_name()); - - let remote_path = conf.remote_path(&local_path)?; - - // Perform a rename inspired by durable_rename from file_utils.c. - // The sequence: - // write(tmp) - // fsync(tmp) - // rename(tmp, new) - // fsync(new) - // fsync(parent) - // For more context about durable_rename check this email from postgres mailing list: - // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com - // If pageserver crashes the temp file will be deleted on startup and re-downloaded. 
- let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - - // TODO: this doesn't use the cached fd for some reason? - let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { - format!( - "Failed to create a destination file for layer '{}'", - temp_file_path.display() - ) - })?; - let mut download = storage.download(&remote_path).await.with_context(|| { - format!( - "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" - ) - })?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") - })?; - - // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: - // A file will not be closed immediately when it goes out of scope if there are any IO operations - // that have not yet completed. To ensure that a file is closed immediately when it is dropped, - // you should call flush before dropping it. - // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. 
- destination_file.flush().await.with_context(|| { - format!( - "failed to flush source file at {}", - temp_file_path.display() - ) - })?; - - match layer_metadata.file_size() { - Some(expected) if expected != bytes_amount => { - anyhow::bail!( - "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", - temp_file_path.display() - ); - } - Some(_) | None => { - // matches, or upgrading from an earlier IndexPart version - } - } - - // not using sync_data because it can lose file size update - destination_file.sync_all().await.with_context(|| { - format!( - "failed to fsync source file at {}", - temp_file_path.display() - ) - })?; - drop(destination_file); - - fail::fail_point!("remote-storage-download-pre-rename", |_| { - bail!("remote-storage-download-pre-rename failpoint triggered") - }); - - fs::rename(&temp_file_path, &local_path).await?; - - fsync_path(&local_path) - .await - .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?; - - tracing::info!("download complete: {}", local_path.display()); - - Ok(bytes_amount) -} - -const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; - -pub fn is_temp_download_file(path: &Path) -> bool { - let extension = path.extension().map(|pname| { - pname - .to_str() - .expect("paths passed to this function must be valid Rust strings") - }); - match extension { - Some(TEMP_DOWNLOAD_EXTENSION) => true, - Some(_) => false, - None => false, - } -} - -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines<'a>( - storage: &'a GenericRemoteStorage, - conf: &'static PageServerConf, - tenant_id: TenantId, -) -> anyhow::Result> { - let tenant_path = conf.timelines_path(&tenant_id); - let tenant_storage_path = conf.remote_path(&tenant_path)?; - - let timelines = storage - .list_prefixes(Some(&tenant_storage_path)) - .await - .with_context(|| { - format!( - "Failed to list tenant storage path 
{tenant_storage_path:?} to get remote timelines to download" - ) - })?; - - if timelines.is_empty() { - anyhow::bail!("no timelines found on the remote storage") - } - - let mut timeline_ids = HashSet::new(); - let mut part_downloads = FuturesUnordered::new(); - - for timeline_remote_storage_key in timelines { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") - })?; - - let timeline_id: TimelineId = object_name.parse().with_context(|| { - format!("failed to parse object name into timeline id '{object_name}'") - })?; - - // list_prefixes returns all files with the prefix. If we haven't seen this timeline ID - // yet, launch a download task for it. - if !timeline_ids.contains(&timeline_id) { - timeline_ids.insert(timeline_id); - let storage_clone = storage.clone(); - part_downloads.push(async move { - ( - timeline_id, - download_index_part(conf, &storage_clone, tenant_id, timeline_id) - .instrument(info_span!("download_index_part", timeline=%timeline_id)) - .await, - ) - }); - } - } - - // Wait for all the download tasks to complete. 
- let mut timeline_parts = Vec::new(); - while let Some((timeline_id, part_upload_result)) = part_downloads.next().await { - let index_part = part_upload_result - .with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?; - - debug!("Successfully fetched index part for timeline {timeline_id}"); - timeline_parts.push((timeline_id, index_part)); - } - Ok(timeline_parts) -} - -pub async fn download_index_part( - conf: &'static PageServerConf, - storage: &GenericRemoteStorage, - tenant_id: TenantId, - timeline_id: TimelineId, -) -> Result { - let index_part_path = conf - .metadata_path(timeline_id, tenant_id) - .with_file_name(IndexPart::FILE_NAME); - let part_storage_path = conf - .remote_path(&index_part_path) - .map_err(DownloadError::BadInput)?; - - let mut index_part_download = storage.download(&part_storage_path).await?; - - let mut index_part_bytes = Vec::new(); - tokio::io::copy( - &mut index_part_download.download_stream, - &mut index_part_bytes, - ) - .await - .with_context(|| format!("Failed to download an index part into file {index_part_path:?}")) - .map_err(DownloadError::Other)?; - - let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes) - .with_context(|| { - format!("Failed to deserialize index part file into file {index_part_path:?}") - }) - .map_err(DownloadError::Other)?; - - let index_part = index_part.remove_unclean_layer_file_names(); - - Ok(index_part) -} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 3325ce01d4..a1b3ad26b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -25,7 +25,6 @@ //! the current task has been requested to shut down. You can use that with //! Tokio select!(). //! -//! //! TODO: This would be a good place to also handle panics in a somewhat sane way. //! Depending on what task panics, we might want to kill the whole server, or //! only a single tenant or timeline. 
@@ -36,6 +35,7 @@ #![allow(clippy::declare_interior_mutable_const)] use std::collections::HashMap; +use std::fmt; use std::future::Future; use std::panic::AssertUnwindSafe; use std::sync::atomic::{AtomicU64, Ordering}; @@ -43,9 +43,9 @@ use std::sync::{Arc, Mutex}; use futures::FutureExt; use tokio::runtime::Runtime; -use tokio::sync::watch; use tokio::task::JoinHandle; use tokio::task_local; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; @@ -135,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create background op runtime") }); +#[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); +impl fmt::Display for PageserverTaskId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Each task that we track is associated with a "task ID". It's just an /// increasing number that we assign. Note that it is different from tokio::task::Id. static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1); @@ -146,11 +153,10 @@ static TASKS: Lazy>>> = Lazy::new(|| Mutex::new(HashMap::new())); task_local! { - // There is a Tokio watch channel for each task, which can be used to signal the - // task that it needs to shut down. This task local variable holds the receiving - // end of the channel. The sender is kept in the global registry, so that anyone - // can send the signal to request task shutdown. - static SHUTDOWN_RX: watch::Receiver; + // This is a cancellation token which will be cancelled when a task needs to shut down. The + // root token is kept in the global registry, so that anyone can send the signal to request + // task shutdown. + static SHUTDOWN_TOKEN: CancellationToken; // Each task holds reference to its own PageServerTask here. 
static CURRENT_TASK: Arc; @@ -200,11 +206,20 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, + // Task that downloads a file from remote storage + RemoteDownloadTask, + // task that handles the initial downloading of all tenants InitialLoad, // task that handles attaching a tenant Attach, + + // task that handhes metrics collection + MetricsCollection, + + // task that drives downloading layers + DownloadAllRemoteLayers, } #[derive(Default)] @@ -226,8 +241,8 @@ struct PageServerTask { name: String, - // To request task shutdown, send 'true' to the channel to notify the task. - shutdown_tx: watch::Sender, + // To request task shutdown, just cancel this token. + cancel: CancellationToken, mutable: Mutex, } @@ -247,13 +262,13 @@ pub fn spawn( where F: Future> + Send + 'static, { - let (shutdown_tx, shutdown_rx) = watch::channel(false); + let cancel = CancellationToken::new(); let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); let task = Arc::new(PageServerTask { task_id: PageserverTaskId(task_id), kind, name: name.to_string(), - shutdown_tx, + cancel: cancel.clone(), mutable: Mutex::new(MutableTaskState { tenant_id, timeline_id, @@ -271,7 +286,7 @@ where task_name, task_id, task_cloned, - shutdown_rx, + cancel, shutdown_process_on_error, future, )); @@ -288,7 +303,7 @@ async fn task_wrapper( task_name: String, task_id: u64, task: Arc, - shutdown_rx: watch::Receiver, + shutdown_token: CancellationToken, shutdown_process_on_error: bool, future: F, ) where @@ -296,9 +311,9 @@ async fn task_wrapper( { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_RX + let result = SHUTDOWN_TOKEN .scope( - shutdown_rx, + shutdown_token, CURRENT_TASK.scope(task, { // We use AssertUnwindSafe here so that the payload function // doesn't need to be UnwindSafe. 
We don't do anything after the @@ -408,7 +423,7 @@ pub async fn shutdown_tasks( && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) { - let _ = task.shutdown_tx.send_replace(true); + task.cancel.cancel(); victim_tasks.push(Arc::clone(task)); } } @@ -436,24 +451,35 @@ pub fn current_task_kind() -> Option { CURRENT_TASK.try_with(|ct| ct.kind).ok() } +pub fn current_task_id() -> Option { + CURRENT_TASK.try_with(|ct| ct.task_id).ok() +} + /// A Future that can be used to check if the current task has been requested to /// shut down. pub async fn shutdown_watcher() { - let mut shutdown_rx = SHUTDOWN_RX - .try_with(|rx| rx.clone()) + let token = SHUTDOWN_TOKEN + .try_with(|t| t.clone()) .expect("shutdown_requested() called in an unexpected task or thread"); - while !*shutdown_rx.borrow() { - if shutdown_rx.changed().await.is_err() { - break; - } - } + token.cancelled().await; +} + +/// Clone the current task's cancellation token, which can be moved across tasks. +/// +/// When the task which is currently executing is shutdown, the cancellation token will be +/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or +/// `tokio::task::JoinSet::spawn`. +pub fn shutdown_token() -> CancellationToken { + SHUTDOWN_TOKEN + .try_with(|t| t.clone()) + .expect("shutdown_token() called in an unexpected task or thread") } /// Has the current task been requested to shut down? 
pub fn is_shutdown_requested() -> bool { - if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { - *shutdown_rx.borrow() + if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) { + cancel.is_cancelled() } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4fcb1e3ba3..d74f263f08 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -13,13 +13,13 @@ use anyhow::{bail, Context}; use bytes::Bytes; +use futures::FutureExt; use futures::Stream; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use tokio::sync::watch; -use tokio_util::io::StreamReader; -use tokio_util::io::SyncIoBridge; +use tokio::task::JoinSet; use tracing::*; use utils::crashsafe::path_with_suffix_extension; @@ -36,7 +36,6 @@ use std::io::Write; use std::ops::Bound::Included; use std::path::Path; use std::path::PathBuf; -use std::pin::Pin; use std::process::Command; use std::process::Stdio; use std::sync::Arc; @@ -45,24 +44,25 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; +use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; -use crate::storage_sync::create_remote_timeline_client; -use crate::storage_sync::index::IndexPart; -use crate::storage_sync::list_remote_timelines; -use crate::storage_sync::RemoteTimelineClient; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; +use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::storage_layer::DeltaLayer; +use crate::tenant::storage_layer::ImageLayer; use 
crate::tenant::storage_layer::Layer; -use crate::tenant_config::TenantConfOpt; + use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; use crate::walredo::WalRedoManager; -use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; +use crate::TEMP_FILE_SUFFIX; pub use pageserver_api::models::TenantState; use toml_edit; @@ -74,23 +74,25 @@ use utils::{ mod blob_io; pub mod block_io; -mod delta_layer; mod disk_btree; pub(crate) mod ephemeral_file; -pub mod filename; -mod image_layer; -mod inmemory_layer; pub mod layer_map; pub mod metadata; mod par_fsync; +mod remote_timeline_client; pub mod storage_layer; +pub mod config; +pub mod mgr; +pub mod tasks; +pub mod upload_queue; + mod timeline; pub mod size; -pub use timeline::Timeline; +pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -125,11 +127,11 @@ pub struct Tenant { timelines: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding - // `timelines` mutex during all GC iteration (especially with enforced checkpoint) + // `timelines` mutex during all GC iteration // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... - gc_cs: Mutex<()>, + gc_cs: tokio::sync::Mutex<()>, walredo_mgr: Arc, // provides access to timeline data sitting in the remote storage @@ -233,23 +235,17 @@ impl UninitializedTimeline<'_> { /// Prepares timeline data by loading it from the basebackup archive. 
pub async fn import_basebackup_from_tar( self, - mut copyin_stream: &mut Pin<&mut impl Stream>>, + copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), base_lsn: Lsn, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; - // import_basebackup_from_tar() is not async, mainly because the Tar crate - // it uses is not async. So we need to jump through some hoops: - // - convert the input from client connection to a synchronous Read - // - use block_in_place() - let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + let mut reader = tokio_util::io::StreamReader::new(copyin_stream); + import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn) + .await + .context("Failed to import basebackup")?; - tokio::task::block_in_place(|| { - import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn) - .context("Failed to import basebackup") - })?; - - // Flush loop needs to be spawned in order for checkpoint to be able to flush. + // Flush loop needs to be spawned in order to be able to flush. 
// We want to run proper checkpoint before we mark timeline as available to outside world // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock raw_timeline.maybe_spawn_flush_loop(); @@ -259,9 +255,9 @@ impl UninitializedTimeline<'_> { }); raw_timeline - .checkpoint(CheckpointConfig::Flush) + .freeze_and_flush() .await - .context("Failed to checkpoint after basebackup import")?; + .context("Failed to flush after basebackup import")?; let timeline = self.initialize()?; @@ -336,7 +332,7 @@ impl TimelineUninitMark { let uninit_mark_parent = uninit_mark_file .parent() .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| { + ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") })?; crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; @@ -371,7 +367,7 @@ impl Drop for TimelineUninitMark { // We should not blindly overwrite local metadata with remote one. // For example, consider the following case: -// Checkpoint comes, we update local metadata and start upload task but after that +// Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that // pageserver crashes. During startup we'll load new metadata, and then reset it // to the state of remote one. But current layermap will have layers from the old // metadata which is inconsistent. @@ -480,7 +476,7 @@ impl Tenant { let timeline = UninitializedTimeline { owning_tenant: self, timeline_id, - raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())), + raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())), }; // Do not start walreceiver here. 
We do need loaded layer map for reconcile_with_remote // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver @@ -510,7 +506,7 @@ impl Tenant { ) })?; broken_timeline.set_state(TimelineState::Broken); - timelines_accessor.insert(timeline_id, Arc::new(broken_timeline)); + timelines_accessor.insert(timeline_id, broken_timeline); Err(e) } } @@ -600,7 +596,7 @@ impl Tenant { match tenant_clone.attach().await { Ok(_) => {} Err(e) => { - tenant_clone.set_broken(); + tenant_clone.set_broken(&e.to_string()); error!("error attaching tenant: {:?}", e); } } @@ -645,22 +641,62 @@ impl Tenant { .as_ref() .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?; - let remote_timelines = - list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?; + let remote_timeline_ids = remote_timeline_client::list_remote_timelines( + remote_storage, + self.conf, + self.tenant_id, + ) + .await?; - info!("found {} timelines", remote_timelines.len()); + info!("found {} timelines", remote_timeline_ids.len()); - let mut timeline_ancestors: HashMap = HashMap::new(); - let mut index_parts: HashMap = HashMap::new(); - for (timeline_id, index_part) in remote_timelines { - let remote_metadata = index_part.parse_metadata().with_context(|| { - format!( - "Failed to parse metadata file from remote storage for tenant {} timeline {}", - self.tenant_id, timeline_id - ) - })?; + // Download & parse index parts + let mut part_downloads = JoinSet::new(); + for timeline_id in remote_timeline_ids { + let client = RemoteTimelineClient::new( + remote_storage.clone(), + self.conf, + self.tenant_id, + timeline_id, + ); + part_downloads.spawn( + async move { + debug!("starting index part download"); + + let index_part = client + .download_index_file() + .await + .context("download index file")?; + + let remote_metadata = index_part.parse_metadata().context("parse metadata")?; + + debug!("finished index part download"); + + Result::<_, 
anyhow::Error>::Ok(( + timeline_id, + client, + index_part, + remote_metadata, + )) + } + .map(move |res| { + res.with_context(|| format!("download index part for timeline {timeline_id}")) + }) + .instrument(info_span!("download_index_part", timeline=%timeline_id)), + ); + } + // Wait for all the download tasks to complete & collect results. + let mut remote_clients = HashMap::new(); + let mut index_parts = HashMap::new(); + let mut timeline_ancestors = HashMap::new(); + while let Some(result) = part_downloads.join_next().await { + // NB: we already added timeline_id as context to the error + let result: Result<_, anyhow::Error> = result.context("joinset task join")?; + let (timeline_id, client, index_part, remote_metadata) = result?; + debug!("successfully downloaded index part for timeline {timeline_id}"); timeline_ancestors.insert(timeline_id, remote_metadata); index_parts.insert(timeline_id, index_part); + remote_clients.insert(timeline_id, client); } // For every timeline, download the metadata file, scan the local directory, @@ -673,7 +709,7 @@ impl Tenant { timeline_id, index_parts.remove(&timeline_id).unwrap(), remote_metadata, - remote_storage.clone(), + remote_clients.remove(&timeline_id).unwrap(), ) .await .with_context(|| { @@ -700,22 +736,35 @@ impl Tenant { Ok(()) } - #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))] + /// get size of all remote timelines + /// + /// This function relies on the index_part instead of listing the remote storage + /// + pub async fn get_remote_size(&self) -> anyhow::Result { + let mut size = 0; + + for timeline in self.list_timelines().iter() { + if let Some(remote_client) = &timeline.remote_client { + size += remote_client.get_remote_physical_size(); + } + } + + Ok(size) + } + + #[instrument(skip_all, fields(timeline_id=%timeline_id))] async fn load_remote_timeline( &self, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, - 
remote_storage: GenericRemoteStorage, + remote_client: RemoteTimelineClient, ) -> anyhow::Result<()> { info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) .await .context("Failed to create new timeline directory")?; - let remote_client = - create_remote_timeline_client(remote_storage, self.conf, self.tenant_id, timeline_id)?; - let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() { let timelines = self.timelines.lock().unwrap(); Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else( @@ -811,7 +860,7 @@ impl Tenant { match tenant_clone.load().await { Ok(()) => {} Err(err) => { - tenant_clone.set_broken(); + tenant_clone.set_broken(&err.to_string()); error!("could not load tenant {tenant_id}: {err:?}"); } } @@ -972,18 +1021,14 @@ impl Tenant { None }; - let remote_client = self - .remote_storage - .as_ref() - .map(|remote_storage| { - create_remote_timeline_client( - remote_storage.clone(), - self.conf, - self.tenant_id, - timeline_id, - ) - }) - .transpose()?; + let remote_client = self.remote_storage.as_ref().map(|remote_storage| { + RemoteTimelineClient::new( + remote_storage.clone(), + self.conf, + self.tenant_id, + timeline_id, + ) + }); let remote_startup_data = match &remote_client { Some(remote_client) => match remote_client.download_index_file().await { @@ -1142,7 +1187,8 @@ impl Tenant { ancestor_timeline.wait_lsn(*lsn).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + .await? } None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; @@ -1154,17 +1200,20 @@ impl Tenant { /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// - /// 'target_timeline_id' specifies the timeline to GC, or None for all. 
- /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC - /// to make tests more deterministic. - /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? + /// `target_timeline_id` specifies the timeline to GC, or None for all. + /// + /// The `horizon` an `pitr` parameters determine how much WAL history needs to be retained. + /// Also known as the retention period, or the GC cutoff point. `horizon` specifies + /// the amount of history, as LSN difference from current latest LSN on each timeline. + /// `pitr` specifies the same as a time difference from the current time. The effective + /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever + /// requires more history to be retained. + // pub async fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, - checkpoint_before_gc: bool, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1179,7 +1228,7 @@ impl Tenant { let _timer = STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) + self.gc_iteration_internal(target_timeline_id, horizon, pitr) .await } } @@ -1222,24 +1271,21 @@ impl Tenant { /// /// Used at graceful shutdown. /// - pub async fn checkpoint(&self) -> anyhow::Result<()> { + pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the - // checkpoints. We don't want to block everything else while the - // checkpoint runs. - let timelines_to_checkpoint = { + // flushing. We don't want to block everything else while the + // flushing is performed. 
+ let timelines_to_flush = { let timelines = self.timelines.lock().unwrap(); timelines .iter() - .map(|(id, timeline)| (*id, Arc::clone(timeline))) + .map(|(_id, timeline)| Arc::clone(timeline)) .collect::>() }; - for (id, timeline) in &timelines_to_checkpoint { - timeline - .checkpoint(CheckpointConfig::Flush) - .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id)) - .await?; + for timeline in &timelines_to_flush { + timeline.freeze_and_flush().await?; } Ok(()) @@ -1274,26 +1320,62 @@ impl Tenant { timeline }; - info!("waiting for layer_removal_cs.lock()"); - // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change. - let layer_removal_guard = timeline.layer_removal_cs.lock().await; - info!("got layer_removal_cs.lock(), deleting layer files"); + // Now that the Timeline is in Stopping state, request all the related tasks to + // shut down. + // + // NB: If you call delete_timeline multiple times concurrently, they will + // all go through the motions here. Make sure the code here is idempotent, + // and don't error out if some of the shutdown tasks have already been + // completed! - // NB: storage_sync upload tasks that reference these layers have been cancelled - // by the caller. + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + Some(timeline_id), + ) + .await; + debug!("wal receiver shutdown confirmed"); - let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); - // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up - // with some layers missing. 
- std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?; - info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; - drop(layer_removal_guard); + { + // Grab the layer_removal_cs lock, and actually perform the deletion. + // + // This lock prevents multiple concurrent delete_timeline calls from + // stepping on each other's toes, while deleting the files. It also + // prevents GC or compaction from running at the same time. + // + // Note that there are still other race conditions between + // GC, compaction and timeline deletion. GC task doesn't + // register itself properly with the timeline it's + // operating on. See + // https://github.com/neondatabase/neon/issues/2671 + // + // No timeout here, GC & Compaction should be responsive to the + // `TimelineState::Stopping` change. + info!("waiting for layer_removal_cs.lock()"); + let layer_removal_guard = timeline.layer_removal_cs.lock().await; + info!("got layer_removal_cs.lock(), deleting layer files"); + + // NB: storage_sync upload tasks that reference these layers have been cancelled + // by the caller. + + let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); + // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up + // with some layers missing. + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; + + info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + drop(layer_removal_guard); + } // Remove the timeline from the map. 
let mut timelines = self.timelines.lock().unwrap(); @@ -1371,7 +1453,7 @@ impl Tenant { // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - crate::tenant_tasks::start_background_loops(self.tenant_id); + tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { timeline.set_state(TimelineState::Active); @@ -1414,7 +1496,7 @@ impl Tenant { }); } - pub fn set_broken(&self) { + pub fn set_broken(&self, reason: &str) { self.state.send_modify(|current_state| { match *current_state { TenantState::Active => { @@ -1423,18 +1505,22 @@ impl Tenant { // activated should never be marked as broken. We cope with it the best // we can, but it shouldn't happen. *current_state = TenantState::Broken; - warn!("Changing Active tenant to Broken state"); + warn!("Changing Active tenant to Broken state, reason: {}", reason); } TenantState::Broken => { // This shouldn't happen either - warn!("Tenant is already broken"); + warn!("Tenant is already in Broken state"); } TenantState::Stopping => { // This shouldn't happen either *current_state = TenantState::Broken; - warn!("Marking Stopping tenant as Broken"); + warn!( + "Marking Stopping tenant as Broken state, reason: {}", + reason + ); } TenantState::Loading | TenantState::Attaching => { + info!("Setting tenant as Broken state, reason: {}", reason); *current_state = TenantState::Broken; } } @@ -1595,7 +1681,7 @@ impl Tenant { new_metadata: TimelineMetadata, ancestor: Option>, remote_client: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( ancestor.is_some(), @@ -1631,7 +1717,7 @@ impl Tenant { conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), - gc_cs: Mutex::new(()), + gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, state, @@ -1778,16 +1864,22 @@ impl Tenant { target_timeline_id: Option, 
horizon: u64, pitr: Duration, - checkpoint_before_gc: bool, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); - let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?; + let gc_timelines = self + .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); - info!("starting on {} timelines", gc_timelines.len()); + // If there is nothing to GC, we don't want any messages in the INFO log. + if !gc_timelines.is_empty() { + info!("{} timelines need GC", gc_timelines.len()); + } else { + debug!("{} timelines need GC", gc_timelines.len()); + } // Perform GC for each timeline. // @@ -1805,18 +1897,6 @@ impl Tenant { // made. break; } - - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. - if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced).await?; - info!( - "timeline {} checkpoint_before_gc done", - timeline.timeline_id - ); - } - let result = timeline.gc().await?; totals += result; } @@ -1830,7 +1910,7 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info(&self) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. 
@@ -1841,54 +1921,60 @@ impl Tenant { let target_timeline_id = None; self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .await } - fn refresh_gc_info_internal( + async fn refresh_gc_info_internal( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. - let gc_cs = self.gc_cs.lock().unwrap(); - - let timelines = self.timelines.lock().unwrap(); + let gc_cs = self.gc_cs.lock().await; // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new(); - let timeline_ids = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") - } - }; + let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { + let timelines = self.timelines.lock().unwrap(); + let mut all_branchpoints = BTreeSet::new(); + let timeline_ids = { + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; - timelines - .iter() - .map(|(timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { + timelines + .iter() + .map(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = + &timeline_entry.get_ancestor_timeline_id() + { + // If target_timeline is specified, we only need to know branchpoints of its children + if let Some(timeline_id) = target_timeline_id { + if ancestor_timeline_id == &timeline_id { + all_branchpoints.insert(( + *ancestor_timeline_id, + timeline_entry.get_ancestor_lsn(), + 
)); + } + } + // Collect branchpoints for all timelines + else { all_branchpoints.insert(( *ancestor_timeline_id, timeline_entry.get_ancestor_lsn(), )); } } - // Collect branchpoints for all timelines - else { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn())); - } - } - *timeline_id - }) - .collect::>() + *timeline_id + }) + .collect::>() + }; + (all_branchpoints, timeline_ids) }; - drop(timelines); // Ok, we now know all the branch points. // Update the GC information for each timeline. @@ -1914,7 +2000,7 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr)?; + timeline.update_gc_info(branchpoints, cutoff, pitr).await?; gc_timelines.push(timeline); } @@ -1924,7 +2010,7 @@ impl Tenant { } /// Branch an existing timeline - fn branch_timeline( + async fn branch_timeline( &self, src: TimelineId, dst: TimelineId, @@ -1933,10 +2019,11 @@ impl Tenant { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().unwrap(); - let timelines = self.timelines.lock().unwrap(); - let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?; - drop(timelines); + let _gc_cs = self.gc_cs.lock().await; + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst, &timelines)? 
+ }; // In order for the branch creation task to not wait for GC/compaction, // we need to make sure that the starting LSN of the child branch is not out of scope midway by @@ -2083,13 +2170,12 @@ impl Tenant { let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; - tokio::task::block_in_place(|| { - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - pgdata_path, - pgdata_lsn, - ) - }) + import_datadir::import_timeline_from_postgres_datadir( + unfinished_timeline, + pgdata_path, + pgdata_lsn, + ) + .await .with_context(|| { format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") })?; @@ -2105,8 +2191,13 @@ impl Tenant { }); unfinished_timeline - .checkpoint(CheckpointConfig::Flush).await - .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?; + .freeze_and_flush() + .await + .with_context(|| { + format!( + "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}" + ) + })?; let timeline = { let mut timelines = self.timelines.lock().unwrap(); @@ -2135,12 +2226,12 @@ impl Tenant { let tenant_id = self.tenant_id; let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { - let remote_client = create_remote_timeline_client( + let remote_client = RemoteTimelineClient::new( remote_storage.clone(), self.conf, tenant_id, new_timeline_id, - )?; + ); remote_client.init_upload_queue_for_empty_remote(&new_metadata)?; Some(remote_client) } else { @@ -2165,7 +2256,7 @@ impl Tenant { Ok(UninitializedTimeline { owning_tenant: self, timeline_id: new_timeline_id, - raw_timeline: Some((Arc::new(new_timeline), uninit_mark)), + raw_timeline: Some((new_timeline, uninit_mark)), }) } Err(e) => { @@ -2183,7 +2274,7 @@ impl Tenant { new_metadata: TimelineMetadata, ancestor: Option>, remote_client: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { let timeline_data = self 
.create_timeline_data( new_timeline_id, @@ -2266,12 +2357,12 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await } } fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> { - fs::remove_dir_all(&timeline_dir) + fs::remove_dir_all(timeline_dir) .or_else(|e| { if e.kind() == std::io::ErrorKind::NotFound { // we can leave the uninit mark without a timeline dir, @@ -2287,7 +2378,7 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a timeline_dir.display() ) })?; - fs::remove_file(&uninit_mark).with_context(|| { + fs::remove_file(uninit_mark).with_context(|| { format!( "Failed to remove timeline uninit mark file {}", uninit_mark.display() @@ -2387,7 +2478,7 @@ fn try_create_target_tenant_dir( anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); - fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| { + fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( "failed to move tenant {} temporary directory {} into the permanent one {}", tenant_id, @@ -2441,9 +2532,9 @@ fn run_initdb( ); let initdb_output = Command::new(&initdb_bin_path) - .args(&["-D", &initdb_target_dir.to_string_lossy()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) + .args(["-D", &initdb_target_dir.to_string_lossy()]) + .args(["-U", &conf.superuser]) + .args(["-E", "utf8"]) .arg("--no-instructions") // This is only used for a temporary installation that is deleted shortly after, // so no need to fsync it @@ -2486,12 +2577,8 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<() file.read_exact_at(&mut header_buf, 0)?; match u16::from_be_bytes(header_buf) { - 
crate::IMAGE_FILE_MAGIC => { - image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)? - } - crate::DELTA_FILE_MAGIC => { - delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)? - } + crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?, + crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?, magic => bail!("unrecognized magic identifier: {:?}", magic), } @@ -2528,7 +2615,7 @@ pub mod harness { }; use super::*; - use crate::tenant_config::{TenantConf, TenantConfOpt}; + use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; use utils::id::{TenantId, TimelineId}; @@ -2605,9 +2692,11 @@ pub mod harness { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. - let mut tenant_conf = TenantConf::dummy_conf(); - tenant_conf.gc_period = Duration::ZERO; - tenant_conf.compaction_period = Duration::ZERO; + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; let tenant_id = TenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; @@ -2726,9 +2815,18 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); Ok(()) } @@ -2793,7 +2891,9 @@ mod tests { //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - 
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); @@ -2803,15 +2903,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?, "foobar at 0x20" ); @@ -2841,7 +2941,7 @@ mod tests { writer.finish_write(lsn); lsn += 0x10; } - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; { let writer = tline.writer(); writer.put( @@ -2858,7 +2958,7 @@ mod tests { )?; writer.finish_write(lsn); } - tline.checkpoint(CheckpointConfig::Forced).await + tline.freeze_and_flush().await } #[tokio::test] @@ -2873,15 +2973,18 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // FIXME: this doesn't actually remove any layer currently, given how the flushing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. 
tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data - match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .await + { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -2906,7 +3009,10 @@ mod tests { .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? .initialize()?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .await + { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -2933,7 +3039,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?; let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); match tline.get(*TEST_KEY, Lsn(0x25)) { @@ -2954,15 +3060,20 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, 
false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + assert!(newtline + .get(*TEST_KEY, Lsn(0x25)) + .no_ondemand_download() + .is_ok()); Ok(()) } @@ -2976,7 +3087,9 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); @@ -2985,12 +3098,12 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; // Check that the data is still accessible on the branch. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, + newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3007,7 +3120,6 @@ mod tests { .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
.initialize()?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; } let tenant = harness.load().await; @@ -3030,16 +3142,16 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; } // check that both of them are initially unloaded @@ -3111,7 +3223,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3119,7 +3231,7 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3127,7 +3239,7 @@ mod tests { writer.finish_write(Lsn(0x30)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3135,21 +3247,36 @@ mod tests { writer.finish_write(Lsn(0x40)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + assert_eq!( + tline.get(*TEST_KEY, 
Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?, + TEST_IMG("foo at 0x30") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?, + TEST_IMG("foo at 0x40") + ); Ok(()) } // - // Insert 1000 key-value pairs with increasing keys, checkpoint, - // repeat 50 times. + // Insert 1000 key-value pairs with increasing keys, flush, compact, GC. + // Repeat 50 times. // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { @@ -3184,8 +3311,10 @@ mod tests { let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3248,16 +3377,17 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } - // Perform a cycle of checkpoint, compaction, and GC - println!("checkpointing {}", lsn); + // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3305,7 +3435,9 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); - tenant.branch_timeline(tline_id, new_tline_id, 
Some(lsn))?; + tenant + .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); @@ -3331,16 +3463,17 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } - // Perform a cycle of checkpoint, compaction, and GC - println!("checkpointing {}", lsn); + // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3370,7 +3503,9 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); - tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tenant + .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); @@ -3403,7 +3538,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn)?, + tline.get(test_key, *lsn).no_ondemand_download()?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant/config.rs similarity index 89% rename from pageserver/src/tenant_config.rs rename to pageserver/src/tenant/config.rs index 1204d1abd8..c95a98fbc7 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant/config.rs @@ -30,7 +30,7 @@ pub mod defaults { pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; pub const DEFAULT_GC_PERIOD: &str = "100 s"; pub const 
DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; + pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; @@ -191,11 +191,10 @@ impl TenantConfOpt { } } -impl TenantConf { - pub fn default() -> TenantConf { +impl Default for TenantConf { + fn default() -> Self { use defaults::*; - - TenantConf { + Self { checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) .expect("cannot parse default checkpoint timeout"), @@ -220,29 +219,4 @@ impl TenantConf { trace_read_requests: false, } } - - pub fn dummy_conf() -> Self { - TenantConf { - checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: Duration::from_secs(600), - compaction_target_size: 4 * 1024 * 1024, - compaction_period: Duration::from_secs(10), - compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, - gc_horizon: defaults::DEFAULT_GC_HORIZON, - gc_period: Duration::from_secs(10), - image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: Duration::from_secs(60 * 60), - walreceiver_connect_timeout: humantime::parse_duration( - defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .unwrap(), - lagging_wal_timeout: humantime::parse_duration( - defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT, - ) - .unwrap(), - max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .unwrap(), - trace_read_requests: false, - } - } } diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 33255dbd82..88dff32b76 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { off += keys_len as u64; let 
values_off = off as usize; - let values_len = num_children as usize * VALUE_SZ as usize; + let values_len = num_children as usize * VALUE_SZ; //off += values_len as u64; let prefix = &buf[prefix_off..prefix_off + prefix_len as usize]; @@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { while low < high { let mid = low + size / 2; - let key_off = mid as usize * self.suffix_len as usize; + let key_off = mid * self.suffix_len as usize; let suffix = &self.keys[key_off..key_off + self.suffix_len as usize]; // Does this match? keybuf[self.prefix_len as usize..].copy_from_slice(suffix); @@ -328,7 +328,7 @@ where while idx < node.num_children as usize { let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx as usize); + let value = node.value(idx); #[allow(clippy::collapsible_if)] if node.level == 0 { // leaf @@ -368,7 +368,7 @@ where key_off -= suffix_len; let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx as usize); + let value = node.value(idx); #[allow(clippy::collapsible_if)] if node.level == 0 { // leaf @@ -629,7 +629,7 @@ impl BuildNode { self.keys.extend(&key[self.prefix.len()..]); self.values.extend(value.0); - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); self.size += self.suffix_len + VALUE_SZ; @@ -674,7 +674,7 @@ impl BuildNode { self.size -= prefix_len * self.num_children as usize; self.size += prefix_len; - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); true @@ -684,7 +684,7 @@ impl BuildNode { /// Serialize the node to on-disk format. 
/// fn pack(&self) -> Bytes { - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); assert!(self.num_children > 0); @@ -940,7 +940,7 @@ mod tests { let t = -(f64::ln(u)); let key_int = (t * 1000000.0) as u128; - all_data.insert(key_int as u128, idx as u64); + all_data.insert(key_int, idx as u64); } // Build a tree from it diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 0774fa42a6..c433e65ad2 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -91,7 +91,7 @@ impl EphemeralFile { break; } - off += n as usize; + off += n; } Ok(()) } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 19252ecf6e..44bed5959f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -12,7 +12,6 @@ use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::inmemory_layer::InMemoryLayer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; use amplify_num::i256; use anyhow::Result; @@ -27,7 +26,7 @@ use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; -use super::storage_layer::Layer; +use super::storage_layer::{InMemoryLayer, Layer}; /// /// LayerMap tracks what layers exist on a timeline. @@ -261,8 +260,10 @@ where /// contain the version, even if it's missing from the returned /// layer. /// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Result>> { - // linear search + /// NOTE: This only searches the 'historic' layers, *not* the + /// 'open' and 'frozen' layers! 
+ /// + pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { // Find the latest image layer that covers the given key let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; @@ -286,10 +287,10 @@ where assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match - return Ok(Some(SearchResult { + return Some(SearchResult { layer: Arc::clone(l), lsn_floor: img_lsn, - })); + }); } if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { latest_img = Some(Arc::clone(l)); @@ -327,14 +328,16 @@ where latest_delta.replace(Arc::clone(l)); break; } - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. - if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { + if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) { + // this layer's end LSN is smaller than the requested point. If there's + // nothing newer, this is what we need to return. Remember this. 
+ if let Some(old_candidate) = &latest_delta { + if l.get_lsn_range().end > old_candidate.get_lsn_range().end { + latest_delta.replace(Arc::clone(l)); + } + } else { latest_delta.replace(Arc::clone(l)); } - } else { - latest_delta.replace(Arc::clone(l)); } } if let Some(l) = latest_delta { @@ -346,19 +349,19 @@ where Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), l.get_lsn_range().start, ); - Ok(Some(SearchResult { + Some(SearchResult { lsn_floor, layer: l, - })) + }) } else if let Some(l) = latest_img { trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Ok(Some(SearchResult { + Some(SearchResult { lsn_floor: latest_img_lsn.unwrap(), layer: l, - })) + }) } else { trace!("no layer found for request on {key} at {end_lsn}"); - Ok(None) + None } } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index f3a0a5171a..297cccbe30 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -255,8 +255,7 @@ pub fn save_metadata( // fsync the parent directory to ensure the directory entry is durable if first_save { let timeline_dir = File::open( - &path - .parent() + path.parent() .expect("Metadata should always have a parent dir"), )?; timeline_dir.sync_all()?; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant/mgr.rs similarity index 91% rename from pageserver/src/tenant_mgr.rs rename to pageserver/src/tenant/mgr.rs index f4f1eba717..dce7cd8bae 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -17,8 +17,8 @@ use utils::crashsafe; use crate::config::PageServerConf; use crate::task_mgr::{self, TaskKind}; +use crate::tenant::config::TenantConfOpt; use crate::tenant::{Tenant, TenantState}; -use crate::tenant_config::TenantConfOpt; use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; @@ -196,7 +196,7 @@ pub async fn shutdown_all_tenants() { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); - if let Err(err) = 
tenant.checkpoint().await { + if let Err(err) = tenant.freeze_and_flush().await { error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } @@ -216,8 +216,7 @@ pub async fn create_tenant( hash_map::Entry::Vacant(v) => { // Hold the write_tenants() lock, since all of this is local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating`. - let tenant_directory = - super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; let created_tenant = schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; let crated_tenant_id = created_tenant.tenant_id(); @@ -262,27 +261,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { - // Start with the shutdown of timeline tasks (this shuts down the walreceiver) - // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join - // we cannot create new timelines and tenants, and that can take quite some time, - // it can even become stuck due to a bug making whole pageserver unavailable for some operations - // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation - // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests - // will synchronize and either fail with the not found error or succeed - - debug!("waiting for wal receiver to shutdown"); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(tenant_id), - Some(timeline_id), - ) - .await; - debug!("wal receiver shutdown confirmed"); - - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(tenant_id), 
Some(timeline_id)).await; - info!("timeline task shutdown completed"); match get_tenant(tenant_id, true).await { Ok(tenant) => { tenant.delete_timeline(timeline_id).await?; @@ -452,7 +430,7 @@ where Err(e) => { let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { - Some(tenant) => tenant.set_broken(), + Some(tenant) => tenant.set_broken(&e.to_string()), None => warn!("Tenant {tenant_id} got removed from memory"), } Err(e) @@ -496,7 +474,7 @@ pub async fn immediate_gc( async move { fail::fail_point!("immediate_gc_task_pre"); let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, true) + .gc_iteration(Some(timeline_id), gc_horizon, pitr) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it @@ -514,3 +492,53 @@ pub async fn immediate_gc( Ok(wait_task_done) } + +#[cfg(feature = "testing")] +pub async fn immediate_compact( + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Result>, ApiError> { + let guard = TENANTS.read().await; + + let tenant = guard + .get(&tenant_id) + .map(Arc::clone) + .with_context(|| format!("Tenant {tenant_id} not found")) + .map_err(ApiError::NotFound)?; + + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + + // Run in task_mgr to avoid race with detach operation + let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::Compaction, + Some(tenant_id), + Some(timeline_id), + &format!( + "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}" + ), + false, + async move { + let result = timeline + .compact() + .instrument( + info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), + ) + .await; + + match task_done.send(result) { + Ok(_) => (), + Err(result) => error!("failed to send compaction result: 
{result:?}"), + } + Ok(()) + }, + ); + + // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task + drop(guard); + + Ok(wait_task_done) +} diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/tenant/remote_timeline_client.rs similarity index 76% rename from pageserver/src/storage_sync2.rs rename to pageserver/src/tenant/remote_timeline_client.rs index 55dbeaff73..1db69d8b73 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -32,7 +32,8 @@ //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]: //! //! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file. -//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads //! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files. //! //! Internally, these functions create [`UploadOp`]s and put them in a queue. @@ -57,7 +58,7 @@ //! To have a consistent remote structure, it's important that uploads and //! deletions are performed in the right order. For example, the index file //! contains a list of layer files, so it must not be uploaded until all the -//! layer files that are in its list have been succesfully uploaded. +//! layer files that are in its list have been successfully uploaded. //! //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as @@ -139,7 +140,7 @@ //! Note that if we crash during file deletion between the index update //! 
that removes the file from the list of files, and deleting the remote file, //! the file is leaked in the remote storage. Similarly, if a new file is created -//! and uploaded, but the pageserver dies permantently before updating the +//! and uploaded, but the pageserver dies permanently before updating the //! remote index file, the new file is leaked in remote storage. We accept and //! tolerate that for now. //! Note further that we cannot easily fix this by scheduling deletes for every @@ -147,31 +148,43 @@ //! following two cases: //! - (1) We had the file locally, deleted it locally, scheduled a remote delete, //! but crashed before it finished remotely. -//! - (2) We never had the file locally because we were still in tenant attach -//! when we crashed. (Similar case for on-demand download in the future.) +//! - (2) We never had the file locally because we haven't on-demand downloaded +//! it yet. //! -//! # Downloads (= Tenant Attach) +//! # Downloads //! //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for -//! downloading files from the remote storage. Downloads are performed immediately, -//! independently of the uploads. +//! downloading files from the remote storage. Downloads are performed immediately +//! against the `RemoteStorage`, independently of the upload queue. //! //! When we attach a tenant, we perform the following steps: //! - create `Tenant` object in `TenantState::Attaching` state -//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s -//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart` -//! - eagerly download all the remote layers using the client's download APIs -//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state. +//! - List timelines that are present in remote storage, and for each: +//! - download their remote [`IndexPart`]s +//! 
- create `Timeline` struct and a `RemoteTimelineClient` +//! - initialize the client's upload queue with its `IndexPart` +//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart` +//! but not present locally +//! - schedule uploads for layers that are only present locally. +//! - if the remote `IndexPart`'s metadata was newer than the metadata in +//! the local filesystem, write the remote metadata to the local filesystem +//! - After the above is done for each timeline, open the tenant for business by +//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. +//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! -//! Most of the above happens in [`Timeline::reconcile_with_remote`]. +//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers. //! We keep track of the fact that a client is in `Attaching` state in a marker -//! file on the local disk. -//! However, the distinction is moot for storage sync since we call -//! `reconcile_with_remote` for tenants both with and without the marker file. -//! -//! In the future, downloading will be done on-demand and `reconcile_with_remote` -//! will only be responsible for re-scheduling upload ops after a crash of an -//! `Active` tenant. +//! file on the local disk. This is critical because, when we restart the pageserver, +//! we do not want to do the `List timelines` step for each tenant that has already +//! been successfully attached (for performance & cost reasons). +//! Instead, for a tenant without the attach marker file, we assume that the +//! local state is in sync or ahead of the remote state. This includes the list +//! of all of the tenant's timelines, which is particularly critical to be up-to-date: +//! if there's a timeline on the remote that the pageserver doesn't know about, +//! the GC will not consider its branch point, leading to data loss. +//! 
So, for a tenant with the attach marker file, we know that we do not yet have +//! persisted all the remote timeline's metadata files locally. To exclude the +//! risk above, we re-run the procedure for such tenants //! //! # Operating Without Remote Storage //! @@ -194,39 +207,51 @@ mod upload; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; -use std::collections::{HashMap, VecDeque}; -use std::fmt::Debug; -use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; +use std::ops::DerefMut; use tokio::runtime::Runtime; use tracing::{debug, info, warn}; use tracing::{info_span, Instrument}; - use utils::lsn::Lsn; -use self::index::IndexPart; - -use crate::metrics::MeasureRemoteOp; use crate::metrics::RemoteOpFileKind; use crate::metrics::RemoteOpKind; -use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS; -use crate::tenant::filename::LayerFileName; +use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics}; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::{ config::PageServerConf, - storage_sync::index::LayerFileMetadata, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, + tenant::upload_queue::{ + UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, + }, {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}, }; use utils::id::{TenantId, TimelineId}; +use self::index::IndexPart; + +use super::storage_layer::LayerFileName; + +// Occasional network issues and such can cause remote operations to fail, and +// that's expected. If a download fails, we log it at info-level, and retry. +// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN +// level instead, as repeated failures can mean a more serious problem. 
If it +// fails more than FAILED_DOWNLOAD_RETRIES times, we give up +const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; +const FAILED_DOWNLOAD_RETRIES: u32 = 10; + +// Similarly log failed uploads and deletions at WARN level, after this many +// retries. Uploads and deletions are retried forever, though. +const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -256,209 +281,42 @@ pub struct RemoteTimelineClient { upload_queue: Mutex, + metrics: Arc, + storage_impl: GenericRemoteStorage, } -// clippy warns that Uninitialized is much smaller than Initialized, which wastes -// memory for Uninitialized variants. Doesn't matter in practice, there are not -// that many upload queues in a running pageserver, and most of them are initialized -// anyway. -#[allow(clippy::large_enum_variant)] -enum UploadQueue { - Uninitialized, - Initialized(UploadQueueInitialized), - Stopped(UploadQueueStopped), -} - -impl UploadQueue { - fn as_str(&self) -> &'static str { - match self { - UploadQueue::Uninitialized => "Uninitialized", - UploadQueue::Initialized(_) => "Initialized", - UploadQueue::Stopped(_) => "Stopped", - } - } -} - -/// This keeps track of queued and in-progress tasks. -struct UploadQueueInitialized { - /// Counter to assign task IDs - task_counter: u64, - - /// All layer files stored in the remote storage, taking into account all - /// in-progress and queued operations - latest_files: HashMap, - - /// Metadata stored in the remote storage, taking into account all - /// in-progress and queued operations. - /// DANGER: do not return to outside world, e.g., safekeepers. - latest_metadata: TimelineMetadata, - - /// `disk_consistent_lsn` from the last metadata file that was successfully - /// uploaded. `Lsn(0)` if nothing was uploaded yet. - /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. 
- /// Safekeeper can rely on it to make decisions for WAL storage. - last_uploaded_consistent_lsn: Lsn, - - // Breakdown of different kinds of tasks currently in-progress - num_inprogress_layer_uploads: usize, - num_inprogress_metadata_uploads: usize, - num_inprogress_deletions: usize, - - /// Tasks that are currently in-progress. In-progress means that a tokio Task - /// has been launched for it. An in-progress task can be busy uploading, but it can - /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can - /// be waiting for retry in `exponential_backoff`. - inprogress_tasks: HashMap>, - - /// Queued operations that have not been launched yet. They might depend on previous - /// tasks to finish. For example, metadata upload cannot be performed before all - /// preceding layer file uploads have completed. - queued_operations: VecDeque, -} - -struct UploadQueueStopped { - last_uploaded_consistent_lsn: Lsn, -} - -impl UploadQueue { - fn initialize_empty_remote( - &mut self, - metadata: &TimelineMetadata, - ) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized => (), - UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { - anyhow::bail!("already initialized, state {}", self.as_str()) - } - } - - info!("initializing upload queue for empty remote"); - - let state = UploadQueueInitialized { - // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. - latest_files: HashMap::new(), - latest_metadata: metadata.clone(), - // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent - // safekeepers from garbage-collecting anything. 
- last_uploaded_consistent_lsn: Lsn(0), - // what follows are boring default initializations - task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, - inprogress_tasks: HashMap::new(), - queued_operations: VecDeque::new(), - }; - - *self = UploadQueue::Initialized(state); - Ok(self.initialized_mut().expect("we just set it")) - } - - fn initialize_with_current_remote_index_part( - &mut self, - index_part: &IndexPart, - ) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized => (), - UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { - anyhow::bail!("already initialized, state {}", self.as_str()) - } - } - - let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); - for layer_name in &index_part.timeline_layers { - let layer_metadata = index_part - .layer_metadata - .get(layer_name) - .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); - files.insert(layer_name.to_owned(), layer_metadata); - } - - let index_part_metadata = index_part.parse_metadata()?; - info!( - "initializing upload queue with remote index_part.disk_consistent_lsn: {}", - index_part_metadata.disk_consistent_lsn() - ); - - let state = UploadQueueInitialized { - latest_files: files, - latest_metadata: index_part_metadata.clone(), - last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), - // what follows are boring default initializations - task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, - inprogress_tasks: HashMap::new(), - queued_operations: VecDeque::new(), - }; - - *self = UploadQueue::Initialized(state); - Ok(self.initialized_mut().expect("we just set it")) - } - - fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { - anyhow::bail!("queue is in state {}", 
self.as_str()) - } - UploadQueue::Initialized(x) => Ok(x), - } - } -} - -/// An in-progress upload or delete task. -#[derive(Debug)] -struct UploadTask { - /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - task_id: u64, - retries: AtomicU32, - - op: UploadOp, -} - -#[derive(Debug)] -enum UploadOp { - /// Upload a layer file - UploadLayer(LayerFileName, LayerFileMetadata), - - /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), - - /// Delete a file. - Delete(RemoteOpFileKind, LayerFileName), - - /// Barrier. When the barrier operation is reached, - Barrier(tokio::sync::watch::Sender<()>), -} - -impl std::fmt::Display for UploadOp { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match self { - UploadOp::UploadLayer(path, metadata) => { - write!( - f, - "UploadLayer({}, size={:?})", - path.file_name(), - metadata.file_size() - ) - } - UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), - UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()), - UploadOp::Barrier(_) => write!(f, "Barrier"), - } - } -} - impl RemoteTimelineClient { + /// + /// Create a remote storage client for given timeline + /// + /// Note: the caller must initialize the upload queue before any uploads can be scheduled, + /// by calling init_upload_queue. + /// + pub fn new( + remote_storage: GenericRemoteStorage, + conf: &'static PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> RemoteTimelineClient { + RemoteTimelineClient { + conf, + runtime: &BACKGROUND_RUNTIME, + tenant_id, + timeline_id, + storage_impl: remote_storage, + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), + } + } + /// Initialize the upload queue for a remote storage that already received /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. 
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); Ok(()) } @@ -470,6 +328,7 @@ impl RemoteTimelineClient { ) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_empty_remote(local_metadata)?; + self.update_remote_physical_size_gauge(None); Ok(()) } @@ -481,6 +340,24 @@ impl RemoteTimelineClient { } } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { + let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { + current_remote_index_part + .layer_metadata + .values() + // If we don't have the file size for the layer, don't account for it in the metric. + .map(|ilmd| ilmd.file_size.unwrap_or(0)) + .sum() + } else { + 0 + }; + self.metrics.remote_physical_size_gauge().set(size); + } + + pub fn get_remote_physical_size(&self) -> u64 { + self.metrics.remote_physical_size_gauge().get() + } + // // Download operations. 
// @@ -490,6 +367,10 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file(&self) -> Result { + let _unfinished_gauge_guard = self + .metrics + .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download); + download::download_index_part( self.conf, &self.storage_impl, @@ -501,6 +382,7 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Download, + Arc::clone(&self.metrics), ) .await } @@ -515,21 +397,27 @@ impl RemoteTimelineClient { layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { - let downloaded_size = download::download_layer_file( - self.conf, - &self.storage_impl, - self.tenant_id, - self.timeline_id, - layer_file_name, - layer_metadata, - ) - .measure_remote_op( - self.tenant_id, - self.timeline_id, - RemoteOpFileKind::Layer, - RemoteOpKind::Download, - ) - .await?; + let downloaded_size = { + let _unfinished_gauge_guard = self + .metrics + .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download); + download::download_layer_file( + self.conf, + &self.storage_impl, + self.tenant_id, + self.timeline_id, + layer_file_name, + layer_metadata, + ) + .measure_remote_op( + self.tenant_id, + self.timeline_id, + RemoteOpFileKind::Layer, + RemoteOpKind::Download, + Arc::clone(&self.metrics), + ) + .await? + }; // Update the metadata for given layer file. 
The remote index file // might be missing some information for the file; this allows us @@ -539,7 +427,17 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { - upgraded.merge(&new_metadata); + if upgraded.merge(&new_metadata) { + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; + } + // If we don't do an index file upload inbetween here and restart, + // the value will go back down after pageserver restart, since we will + // have lost this data point. + // But, we upload index part fairly frequently, and restart pageserver rarely. + // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. + self.metrics + .remote_physical_size_gauge() + .add(downloaded_size); } else { // The file should exist, since we just downloaded it. warn!( @@ -556,14 +454,20 @@ impl RemoteTimelineClient { // /// - /// Launch an index-file upload operation in the background. + /// Launch an index-file upload operation in the background, with + /// updated metadata. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previosuly scheduled layer file /// upload operations have completed successfully. This is to /// ensure that when the index file claims that layers X, Y and Z - /// exist in remote storage, they really do. - pub fn schedule_index_upload( + /// exist in remote storage, they really do. To wait for the upload + /// to complete, use `wait_completion`. + /// + /// If there were any changes to the list of files, i.e. if any + /// layer file uploads were scheduled, since the last index file + /// upload, those will be included too. 
+ pub fn schedule_index_upload_for_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -574,26 +478,60 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.latest_metadata = metadata.clone(); + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + + Ok(()) + } + + /// + /// Launch an index-file upload operation in the background, if necessary. + /// + /// Use this function to schedule the update of the index file after + /// scheduling file uploads or deletions. If no file uploads or deletions + /// have been scheduled since the last index file upload, this does + /// nothing. + /// + /// Like schedule_index_upload_for_metadata_update(), this merely adds + /// the upload to the upload queue and returns quickly. + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + } + + Ok(()) + } + + /// Launch an index-file upload operation in the background (internal function) + fn schedule_index_upload( + self: &Arc, + upload_queue: &mut UploadQueueInitialized, + metadata_bytes: Vec, + ) { + info!( + "scheduling metadata upload with {} files ({} changed)", + upload_queue.latest_files.len(), + upload_queue.latest_files_changes_since_metadata_upload_scheduled, + ); + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); let index_part = IndexPart::new( upload_queue.latest_files.clone(), disk_consistent_lsn, - upload_queue.latest_metadata.to_bytes()?, + metadata_bytes, ); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - 
self.update_upload_queue_unfinished_metric(1, &op); + self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); - - info!( - "scheduled metadata upload with {} files", - upload_queue.latest_files.len() - ); + upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - - Ok(()) } /// @@ -617,9 +555,10 @@ impl RemoteTimelineClient { upload_queue .latest_files .insert(layer_file_name.clone(), layer_metadata.clone()); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone()); - self.update_upload_queue_unfinished_metric(1, &op); + self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); info!( @@ -635,8 +574,11 @@ impl RemoteTimelineClient { /// /// Launch a delete operation in the background. /// - /// The deletion won't actually be performed, until all preceding - /// upload operations have completed succesfully. + /// Note: This schedules an index file upload before the deletions. The + /// deletion won't actually be performed, until any previously scheduled + /// upload operations, and the index file upload, have completed + /// succesfully. + /// pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], @@ -647,7 +589,6 @@ impl RemoteTimelineClient { // Deleting layers doesn't affect the values stored in TimelineMetadata, // so we don't need update it. Just serialize it. let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); // Update the remote index file, removing the to-be-deleted files from the index, // before deleting the actual files. 
@@ -659,21 +600,17 @@ impl RemoteTimelineClient { let no_bail_here = || { for name in names { upload_queue.latest_files.remove(name); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; } - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata_bytes, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.update_upload_queue_unfinished_metric(1, &op); - upload_queue.queued_operations.push_back(op); + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + self.schedule_index_upload(upload_queue, metadata_bytes); + } // schedule the actual deletions for name in names { let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone()); - self.update_upload_queue_unfinished_metric(1, &op); + self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); info!("scheduled layer file deletion {}", name.file_name()); } @@ -825,7 +762,7 @@ impl RemoteTimelineClient { // upload finishes or times out soon enough. 
if task_mgr::is_shutdown_requested() { info!("upload task cancelled by shutdown request"); - self.update_upload_queue_unfinished_metric(-1, &task.op); + self.calls_unfinished_metric_end(&task.op); self.stop(); return; } @@ -847,11 +784,12 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Upload, + Arc::clone(&self.metrics), ) .await } UploadOp::UploadMetadata(ref index_part, _lsn) => { - upload::upload_index_part( + let res = upload::upload_index_part( self.conf, &self.storage_impl, self.tenant_id, @@ -863,8 +801,13 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Upload, + Arc::clone(&self.metrics), ) - .await + .await; + if res.is_ok() { + self.update_remote_physical_size_gauge(Some(index_part)); + } + res } UploadOp::Delete(metric_file_kind, ref layer_file_name) => { let path = &self @@ -877,6 +820,7 @@ impl RemoteTimelineClient { self.timeline_id, *metric_file_kind, RemoteOpKind::Delete, + Arc::clone(&self.metrics), ) .await } @@ -895,12 +839,14 @@ impl RemoteTimelineClient { Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); - // uploads may fail due to rate limts (IAM, S3) or spurious network and external errors - // such issues are relatively regular, so don't use WARN or ERROR to avoid alerting - // people and tests until the retries are definitely causing delays. - if retries < 3 { + // Uploads can fail due to rate limits (IAM, S3), spurious network problems, + // or other external reasons. Such issues are relatively regular, so log them + // at info level at first, and only WARN if the operation fails repeatedly. 
+ // + // (See similar logic for downloads in `download::download_retry`) + if retries < FAILED_UPLOAD_WARN_THRESHOLD { info!( - "failed to perform remote task {}, will retry (attempt {}): {:?}", + "failed to perform remote task {}, will retry (attempt {}): {:#}", task.op, retries, e ); } else { @@ -964,28 +910,40 @@ impl RemoteTimelineClient { // Launch any queued tasks that were unblocked by this one. self.launch_queued_tasks(upload_queue); } - self.update_upload_queue_unfinished_metric(-1, &task.op); + self.calls_unfinished_metric_end(&task.op); } - fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) { - let (file_kind, op_kind) = match op { + fn calls_unfinished_metric_impl( + &self, + op: &UploadOp, + ) -> Option<(RemoteOpFileKind, RemoteOpKind)> { + let res = match op { UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload), UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload), UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete), UploadOp::Barrier(_) => { // we do not account these - return; + return None; } }; - REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - file_kind.as_str(), - op_kind.as_str(), - ]) - .unwrap() - .add(delta) + Some(res) + } + + fn calls_unfinished_metric_begin(&self, op: &UploadOp) { + let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + Some(x) => x, + None => return, + }; + let guard = self.metrics.call_begin(&file_kind, &op_kind); + guard.will_decrement_manually(); // in unfinished_ops_metric_end() + } + + fn calls_unfinished_metric_end(&self, op: &UploadOp) { + let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + Some(x) => x, + None => return, + }; + self.metrics.call_end(&file_kind, &op_kind); } fn stop(&self) { @@ -1036,7 +994,7 @@ impl RemoteTimelineClient { // Tear down queued ops for op in 
qi.queued_operations.into_iter() { - self.update_upload_queue_unfinished_metric(-1, &op); + self.calls_unfinished_metric_end(&op); // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err() // which is exactly what we want to happen. drop(op); @@ -1049,28 +1007,6 @@ impl RemoteTimelineClient { } } -/// -/// Create a remote storage client for given timeline -/// -/// Note: the caller must initialize the upload queue before any uploads can be scheduled, -/// by calling init_upload_queue. -/// -pub fn create_remote_timeline_client( - remote_storage: GenericRemoteStorage, - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, -) -> anyhow::Result { - Ok(RemoteTimelineClient { - conf, - runtime: &BACKGROUND_RUNTIME, - tenant_id, - timeline_id, - storage_impl: remote_storage, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - }) -} - #[cfg(test)] mod tests { use super::*; @@ -1180,6 +1116,10 @@ mod tests { timeline_id: TIMELINE_ID, storage_impl, upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &harness.tenant_id, + &TIMELINE_ID, + )), }); let remote_timeline_dir = @@ -1211,15 +1151,19 @@ mod tests { assert!(upload_queue.queued_operations.is_empty()); assert!(upload_queue.inprogress_tasks.len() == 2); assert!(upload_queue.num_inprogress_layer_uploads == 2); + + // also check that `latest_file_changes` was updated + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); } // Schedule upload of index. 
Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); - client.schedule_index_upload(&metadata)?; + client.schedule_index_upload_for_metadata_update(&metadata)?; { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.len() == 1); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } // Wait for the uploads to finish @@ -1255,6 +1199,7 @@ mod tests { assert!(upload_queue.inprogress_tasks.len() == 1); assert!(upload_queue.num_inprogress_layer_uploads == 1); assert!(upload_queue.num_inprogress_deletions == 0); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir); diff --git a/pageserver/src/storage_sync2/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs similarity index 100% rename from pageserver/src/storage_sync2/delete.rs rename to pageserver/src/tenant/remote_timeline_client/delete.rs diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs new file mode 100644 index 0000000000..2e79698087 --- /dev/null +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -0,0 +1,316 @@ +//! Helper functions to download files from remote storage with a RemoteStorage +//! +//! The functions in this module retry failed operations automatically, according +//! to the FAILED_DOWNLOAD_RETRIES constant. 
+ +use std::collections::HashSet; +use std::future::Future; +use std::path::Path; + +use anyhow::{anyhow, Context}; +use tokio::fs; +use tokio::io::AsyncWriteExt; +use tracing::{error, info, warn}; + +use crate::config::PageServerConf; +use crate::tenant::storage_layer::LayerFileName; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use remote_storage::{DownloadError, GenericRemoteStorage}; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::{TenantId, TimelineId}; + +use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata}; +use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; + +async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { + fs::File::open(path).await?.sync_all().await +} + +/// +/// If 'metadata' is given, we will validate that the downloaded file's size matches that +/// in the metadata. (In the future, we might do more cross-checks, like CRC validation) +/// +/// Returns the size of the downloaded file. +pub async fn download_layer_file<'a>( + conf: &'static PageServerConf, + storage: &'a GenericRemoteStorage, + tenant_id: TenantId, + timeline_id: TimelineId, + layer_file_name: &'a LayerFileName, + layer_metadata: &'a LayerFileMetadata, +) -> Result { + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + + let local_path = timeline_path.join(layer_file_name.file_name()); + + let remote_path = conf + .remote_path(&local_path) + .map_err(DownloadError::Other)?; + + // Perform a rename inspired by durable_rename from file_utils.c. + // The sequence: + // write(tmp) + // fsync(tmp) + // rename(tmp, new) + // fsync(new) + // fsync(parent) + // For more context about durable_rename check this email from postgres mailing list: + // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com + // If pageserver crashes the temp file will be deleted on startup and re-downloaded. 
+ let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + + let (mut destination_file, bytes_amount) = download_retry( + || async { + // TODO: this doesn't use the cached fd for some reason? + let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { + format!( + "Failed to create a destination file for layer '{}'", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; + let mut download = storage.download(&remote_path).await.with_context(|| { + format!( + "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" + ) + }) + .map_err(DownloadError::Other)?; + let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok((destination_file, bytes_amount)) + }, + &format!("download {remote_path:?}"), + ).await?; + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because + // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. + // But for additional safety lets check/wait for any pending operations. 
+ destination_file + .flush() + .await + .with_context(|| { + format!( + "failed to flush source file at {}", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; + + match layer_metadata.file_size() { + Some(expected) if expected != bytes_amount => { + return Err(DownloadError::Other(anyhow!( + "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", + temp_file_path.display() + ))); + } + Some(_) | None => { + // matches, or upgrading from an earlier IndexPart version + } + } + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| { + format!( + "failed to fsync source file at {}", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; + drop(destination_file); + + fail::fail_point!("remote-storage-download-pre-rename", |_| { + Err(DownloadError::Other(anyhow!( + "remote-storage-download-pre-rename failpoint triggered" + ))) + }); + + fs::rename(&temp_file_path, &local_path) + .await + .with_context(|| { + format!( + "Could not rename download layer file to {}", + local_path.display(), + ) + }) + .map_err(DownloadError::Other)?; + + fsync_path(&local_path) + .await + .with_context(|| format!("Could not fsync layer file {}", local_path.display(),)) + .map_err(DownloadError::Other)?; + + tracing::info!("download complete: {}", local_path.display()); + + Ok(bytes_amount) +} + +const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; + +pub fn is_temp_download_file(path: &Path) -> bool { + let extension = path.extension().map(|pname| { + pname + .to_str() + .expect("paths passed to this function must be valid Rust strings") + }); + match extension { + Some(TEMP_DOWNLOAD_EXTENSION) => true, + Some(_) => false, + None => false, + } +} + +/// List timelines of given tenant in remote storage +pub async fn list_remote_timelines<'a>( + storage: &'a GenericRemoteStorage, + conf: &'static PageServerConf, + 
tenant_id: TenantId, +) -> anyhow::Result> { + let tenant_path = conf.timelines_path(&tenant_id); + let tenant_storage_path = conf.remote_path(&tenant_path)?; + + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let timelines = download_retry( + || storage.list_prefixes(Some(&tenant_storage_path)), + &format!("list prefixes for {tenant_path:?}"), + ) + .await?; + + if timelines.is_empty() { + anyhow::bail!("no timelines found on the remote storage") + } + + let mut timeline_ids = HashSet::new(); + + for timeline_remote_storage_key in timelines { + let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + })?; + + let timeline_id: TimelineId = object_name.parse().with_context(|| { + format!("failed to parse object name into timeline id '{object_name}'") + })?; + + // list_prefixes is assumed to return unique names. Ensure this here. + // NB: it's safer to bail out than warn-log this because the pageserver + // needs to absolutely know about _all_ timelines that exist, so that + // GC knows all the branchpoints. If we skipped over a timeline instead, + // GC could delete a layer that's still needed by that timeline. 
+ anyhow::ensure!( + !timeline_ids.contains(&timeline_id), + "list_prefixes contains duplicate timeline id {timeline_id}" + ); + timeline_ids.insert(timeline_id); + } + + Ok(timeline_ids) +} + +pub(super) async fn download_index_part( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Result { + let index_part_path = conf + .metadata_path(timeline_id, tenant_id) + .with_file_name(IndexPart::FILE_NAME); + let part_storage_path = conf + .remote_path(&index_part_path) + .map_err(DownloadError::BadInput)?; + + let index_part_bytes = download_retry( + || async { + let mut index_part_download = storage.download(&part_storage_path).await?; + + let mut index_part_bytes = Vec::new(); + tokio::io::copy( + &mut index_part_download.download_stream, + &mut index_part_bytes, + ) + .await + .with_context(|| { + format!("Failed to download an index part into file {index_part_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok(index_part_bytes) + }, + &format!("download {part_storage_path:?}"), + ) + .await?; + + let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes) + .with_context(|| { + format!("Failed to deserialize index part file into file {index_part_path:?}") + }) + .map_err(DownloadError::Other)?; + + let index_part = index_part.remove_unclean_layer_file_names(); + + Ok(index_part) +} + +/// +/// Helper function to handle retries for a download operation. +/// +/// Remote operations can fail due to rate limits (IAM, S3), spurious network +/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times, +/// with backoff. 
+/// +/// (See similar logic for uploads in `perform_upload_task`) +async fn download_retry(mut op: O, description: &str) -> Result +where + O: FnMut() -> F, + F: Future>, +{ + let mut attempts = 0; + loop { + let result = op().await; + match result { + Ok(_) => { + if attempts > 0 { + info!("{description} succeeded after {attempts} retries"); + } + return result; + } + + // These are "permanent" errors that should not be retried. + Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => { + return result; + } + // Assume that any other failure might be transient, and the operation might + // succeed if we just keep trying. + Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => { + info!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => { + warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(DownloadError::Other(ref err)) => { + // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. 
+ error!("{description} still failed after {attempts} retries, giving up: {err:?}"); + return result; + } + } + // sleep and retry + exponential_backoff( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempts += 1; + } +} diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs similarity index 94% rename from pageserver/src/storage_sync2/index.rs rename to pageserver/src/tenant/remote_timeline_client/index.rs index 82487339ee..c199b7e10b 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tracing::warn; -use crate::tenant::{filename::LayerFileName, metadata::TimelineMetadata}; +use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName}; use utils::lsn::Lsn; @@ -48,9 +48,17 @@ impl LayerFileMetadata { /// Metadata has holes due to version upgrades. This method is called to upgrade self with the /// other value. /// - /// This is called on the possibly outdated version. - pub fn merge(&mut self, other: &Self) { - self.file_size = other.file_size.or(self.file_size); + /// This is called on the possibly outdated version. Returns true if any changes + /// were made. + pub fn merge(&mut self, other: &Self) -> bool { + let mut changed = false; + + if self.file_size != other.file_size { + self.file_size = other.file_size.or(self.file_size); + changed = true; + } + + changed } } @@ -75,11 +83,6 @@ where /// Additional metadata can might exist in `layer_metadata`. pub timeline_layers: HashSet, - /// FIXME: unused field. 
This should be removed, but that changes the on-disk format, - /// so we need to make sure we're backwards-` (and maybe forwards-) compatible - /// First pass is to move it to Optional and the next would be its removal - missing_layers: Option>, - /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -159,8 +162,6 @@ impl IndexPartUnclean { let IndexPartUnclean { version, timeline_layers, - // this is an unused field, ignore it on cleaning - missing_layers: _, layer_metadata, disk_consistent_lsn, metadata_bytes, @@ -181,7 +182,6 @@ impl IndexPartUnclean { } }) .collect(), - missing_layers: None, layer_metadata: layer_metadata .into_iter() .filter_map(|(l, m)| l.into_clean().map(|l| (l, m))) @@ -217,7 +217,6 @@ impl IndexPart { Self { version: Self::LATEST_VERSION, timeline_layers, - missing_layers: Some(HashSet::new()), layer_metadata, disk_consistent_lsn, metadata_bytes, @@ -232,7 +231,7 @@ impl IndexPart { /// Serialized form of [`LayerFileMetadata`]. 
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { - file_size: Option, + pub(super) file_size: Option, } impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { @@ -251,7 +250,6 @@ mod tests { fn v0_indexpart_is_parsed() { let example = r#"{ "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"], "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] }"#; @@ -259,7 +257,6 @@ mod tests { let expected = IndexPart { version: 0, timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), - missing_layers: None, // disabled fields should not carry unused values further layer_metadata: HashMap::default(), disk_consistent_lsn: 
"0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), @@ -275,7 +272,6 @@ mod tests { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } @@ -288,7 +284,6 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), - missing_layers: None, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), @@ -314,6 +309,7 @@ mod tests { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["This shouldn't fail deserialization"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } @@ -338,7 +334,6 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: 
[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), - missing_layers: None, }; let part = serde_json::from_str::(example).unwrap(); diff --git a/pageserver/src/storage_sync2/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs similarity index 96% rename from pageserver/src/storage_sync2/upload.rs rename to pageserver/src/tenant/remote_timeline_client/upload.rs index 57a524a22d..5082fa1634 100644 --- a/pageserver/src/storage_sync2/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -5,12 +5,12 @@ use fail::fail_point; use std::path::Path; use tokio::fs; -use super::index::IndexPart; -use crate::config::PageServerConf; -use crate::storage_sync::LayerFileMetadata; +use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart}; use remote_storage::GenericRemoteStorage; use utils::id::{TenantId, TimelineId}; +use super::index::LayerFileMetadata; + /// Serializes and uploads the given index part data to the remote 
storage. pub(super) async fn upload_index_part<'a>( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 24d9b2a10e..aa11985cbe 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use anyhow::Context; +use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; + use super::Tenant; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -67,6 +70,7 @@ pub(super) async fn gather_inputs( let timelines = tenant .refresh_gc_info() + .await .context("Failed to refresh gc_info before gathering inputs")?; if timelines.is_empty() { @@ -93,8 +97,6 @@ pub(super) async fn gather_inputs( // used to determine the `retention_period` for the size model let mut max_cutoff_distance = None; - // this will probably conflict with on-demand downloaded layers, or at least force them all - // to be downloaded for timeline in timelines { let last_record_lsn = timeline.get_last_record_lsn(); @@ -212,11 +214,30 @@ pub(super) async fn gather_inputs( let mut have_any_error = false; while let Some(res) = joinset.join_next().await { - // each of these come with Result, JoinError> + // each of these come with Result, JoinError> // because of spawn + spawn_blocking - let res = res.and_then(|inner| inner); match res { - Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => { + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures, nor should be"); + } + Err(join_error) => { + // cannot really do anything, as this panic is likely a bug + error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + have_any_error = true; + } + Ok(Err(recv_result_error)) => { + // cannot really do anything, as this panic is likely a bug + error!("failed to receive logical size query result: 
{recv_result_error:#}"); + have_any_error = true; + } + Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => { + warn!( + timeline_id=%timeline.timeline_id, + "failed to calculate logical size at {lsn}: {error:#}" + ); + have_any_error = true; + } + Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => { debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); logical_size_cache.insert((timeline.timeline_id, lsn), size); @@ -228,21 +249,6 @@ pub(super) async fn gather_inputs( command: Command::Update(size), }); } - Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => { - warn!( - timeline_id=%timeline.timeline_id, - "failed to calculate logical size at {lsn}: {error:#}" - ); - have_any_error = true; - } - Err(join_error) if join_error.is_cancelled() => { - unreachable!("we are not cancelling any of the futures, nor should be"); - } - Err(join_error) => { - // cannot really do anything, as this panic is likely a bug - error!("logical size query panicked: {join_error:#}"); - have_any_error = true; - } } } @@ -351,7 +357,7 @@ enum LsnKind { struct TimelineAtLsnSizeResult( Arc, utils::lsn::Lsn, - anyhow::Result, + Result, ); #[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))] @@ -359,17 +365,15 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, -) -> Result { - let permit = tokio::sync::Semaphore::acquire_owned(limit) +) -> Result { + let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); - tokio::task::spawn_blocking(move || { - let _permit = permit; - let size_res = timeline.calculate_logical_size(lsn); - TimelineAtLsnSizeResult(timeline, lsn, size_res) - }) - .await + let size_res = timeline + .spawn_ondemand_logical_size_calculation(lsn) + .await?; + Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } #[test] diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs 
index 3ad62587d3..d87a248bdf 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,6 +1,10 @@ -//! //! Common traits and structs for layers -//! + +mod delta_layer; +mod filename; +mod image_layer; +mod inmemory_layer; +mod remote_layer; use crate::repository::{Key, Value}; use crate::walrecord::NeonWalRecord; @@ -8,13 +12,19 @@ use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; +use std::sync::Arc; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, }; -use super::filename::LayerFileName; +pub use delta_layer::{DeltaLayer, DeltaLayerWriter}; +pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf}; +pub use image_layer::{ImageLayer, ImageLayerWriter}; +pub use inmemory_layer::InMemoryLayer; +pub use remote_layer::RemoteLayer; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -116,6 +126,12 @@ pub trait Layer: Send + Sync { fn dump(&self, verbose: bool) -> Result<()>; } +/// Returned by [`Layer::iter`] +pub type LayerIter<'i> = Box> + 'i>; + +/// Returned by [`Layer::key_iter`] +pub type LayerKeyIter<'i> = Box + 'i>; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -141,17 +157,42 @@ pub trait PersistentLayer: Layer { fn filename(&self) -> LayerFileName; // Path to the layer file in the local filesystem. - fn local_path(&self) -> PathBuf; + // `None` for `RemoteLayer`. + fn local_path(&self) -> Option; /// Iterate through all keys and values stored in the layer - fn iter(&self) -> Box> + '_>; + fn iter(&self) -> Result>; /// Iterate through all keys stored in the layer. Returns key, lsn and value size /// It is used only for compaction and so is currently implemented only for DeltaLayer - fn key_iter(&self) -> Box + '_> { + fn key_iter(&self) -> Result> { panic!("Not implemented") } /// Permanently remove this layer from disk. 
fn delete(&self) -> Result<()>; + + fn downcast_remote_layer(self: Arc) -> Option> { + None + } + + fn is_remote_layer(&self) -> bool { + false + } + + /// Returns None if the layer file size is not known. + /// + /// Should not change over the lifetime of the layer object because + /// current_physical_size is computed as the sum of this value. + fn file_size(&self) -> Option; +} + +pub fn downcast_remote_layer( + layer: &Arc, +) -> Option> { + if layer.is_remote_layer() { + Arc::clone(layer).downcast_remote_layer() + } else { + None + } } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs similarity index 96% rename from pageserver/src/tenant/delta_layer.rs rename to pageserver/src/tenant/storage_layer/delta_layer.rs index d8aaa3e8b9..302ba2dc78 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -29,7 +29,6 @@ use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::filename::{DeltaFileName, PathOrConf}; use crate::tenant::storage_layer::{ PersistentLayer, ValueReconstructResult, ValueReconstructState, }; @@ -39,7 +38,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::{BufWriter, Write}; use std::io::{Seek, SeekFrom}; use std::ops::Range; @@ -54,8 +53,7 @@ use utils::{ lsn::Lsn, }; -use super::filename::LayerFileName; -use super::storage_layer::Layer; +use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf}; /// /// Header stored in the beginning of the file @@ -183,6 +181,8 @@ pub struct
DeltaLayer { pub key_range: Range, pub lsn_range: Range, + pub file_size: u64, + inner: RwLock, } @@ -387,32 +387,23 @@ impl PersistentLayer for DeltaLayer { self.layer_name().into() } - fn local_path(&self) -> PathBuf { - self.path() + fn local_path(&self) -> Option { + Some(self.path()) } - fn iter<'a>(&'a self) -> Box> + 'a> { - let inner = match self.load() { - Ok(inner) => inner, - Err(e) => panic!("Failed to load a delta layer: {e:?}"), - }; - - match DeltaValueIter::new(inner) { + fn iter(&self) -> Result> { + let inner = self.load().context("load delta layer")?; + Ok(match DeltaValueIter::new(inner) { Ok(iter) => Box::new(iter), Err(err) => Box::new(std::iter::once(Err(err))), - } + }) } - fn key_iter<'a>(&'a self) -> Box + 'a> { - let inner = match self.load() { - Ok(inner) => inner, - Err(e) => panic!("Failed to load a delta layer: {e:?}"), - }; - - match DeltaKeyIter::new(inner) { - Ok(iter) => Box::new(iter), - Err(e) => panic!("Layer index is corrupted: {e:?}"), - } + fn key_iter(&self) -> Result> { + let inner = self.load()?; + Ok(Box::new( + DeltaKeyIter::new(inner).context("Layer index is corrupted")?, + )) } fn delete(&self) -> Result<()> { @@ -420,6 +411,10 @@ impl PersistentLayer for DeltaLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl DeltaLayer { @@ -544,6 +539,7 @@ impl DeltaLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &DeltaFileName, + file_size: u64, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), @@ -551,6 +547,7 @@ impl DeltaLayer { tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), + file_size, inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -563,21 +560,23 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -734,6 +733,10 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + // Note: Because we opened the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -743,6 +746,7 @@ impl DeltaLayerWriterInner { timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs similarity index 100% rename from pageserver/src/tenant/filename.rs rename to pageserver/src/tenant/storage_layer/filename.rs diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs similarity index 95% rename from pageserver/src/tenant/image_layer.rs rename to pageserver/src/tenant/storage_layer/image_layer.rs index e08e938a4f..9a26fce73b 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -21,11 +21,10 @@ //! actual page images are stored in the "values" part. 
use crate::config::PageServerConf; use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, Value, KEY_SIZE}; +use crate::repository::{Key, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::filename::{ImageFileName, PathOrConf}; use crate::tenant::storage_layer::{ PersistentLayer, ValueReconstructResult, ValueReconstructState, }; @@ -36,10 +35,11 @@ use bytes::Bytes; use hex; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::Write; use std::io::{Seek, SeekFrom}; use std::ops::Range; +use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard}; use tracing::*; @@ -50,8 +50,8 @@ use utils::{ lsn::Lsn, }; -use super::filename::LayerFileName; -use super::storage_layer::Layer; +use super::filename::{ImageFileName, LayerFileName, PathOrConf}; +use super::{Layer, LayerIter}; /// /// Header stored in the beginning of the file @@ -105,6 +105,7 @@ pub struct ImageLayer { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub key_range: Range, + pub file_size: u64, // This entry contains an image of all pages as of this LSN pub lsn: Lsn, @@ -208,8 +209,8 @@ impl PersistentLayer for ImageLayer { self.layer_name().into() } - fn local_path(&self) -> PathBuf { - self.path() + fn local_path(&self) -> Option { + Some(self.path()) } fn get_tenant_id(&self) -> TenantId { @@ -219,7 +220,7 @@ impl PersistentLayer for ImageLayer { fn get_timeline_id(&self) -> TimelineId { self.timeline_id } - fn iter(&self) -> Box>> { + fn iter(&self) -> Result> { unimplemented!(); } @@ -228,6 +229,10 @@ impl PersistentLayer for ImageLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl 
ImageLayer { @@ -344,6 +349,7 @@ impl ImageLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &ImageFileName, + file_size: u64, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), @@ -351,6 +357,7 @@ impl ImageLayer { tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, + file_size, inner: RwLock::new(ImageLayerInner { loaded: false, file: None, @@ -363,21 +370,21 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. - pub fn new_for_path(path: &Path, file: F) -> Result - where - F: std::os::unix::prelude::FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; - + let metadata = file + .metadata() + .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -523,6 +530,10 @@ impl ImageLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get metadata to determine file size")?; + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. 
@@ -532,6 +543,7 @@ impl ImageLayerWriterInner { tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, @@ -556,7 +568,7 @@ impl ImageLayerWriterInner { lsn: self.lsn, }, ); - std::fs::rename(self.path, &final_path)?; + std::fs::rename(self.path, final_path)?; trace!("created image layer {}", layer.path().display()); diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs similarity index 99% rename from pageserver/src/tenant/inmemory_layer.rs rename to pageserver/src/tenant/storage_layer/inmemory_layer.rs index 8f64281cb1..93356a9d8c 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -8,7 +8,6 @@ use crate::config::PageServerConf; use crate::repository::{Key, Value}; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockReader; -use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; use crate::walrecord; @@ -28,7 +27,7 @@ use std::fmt::Write as _; use std::ops::Range; use std::sync::RwLock; -use super::storage_layer::Layer; +use super::{DeltaLayer, DeltaLayerWriter, Layer}; thread_local! { /// A buffer for serializing object during [`InMemoryLayer::put_value`]. @@ -97,6 +96,7 @@ impl Layer for InMemoryLayer { }; self.start_lsn..end_lsn } + fn is_incremental(&self) -> bool { // in-memory layer is always considered incremental. true diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs new file mode 100644 index 0000000000..33474bb4a2 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -0,0 +1,210 @@ +//! 
A RemoteLayer is an in-memory placeholder for a layer file that exists +//! in remote storage. +//! +use crate::config::PageServerConf; +use crate::repository::Key; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use anyhow::{bail, Result}; +use std::ops::Range; +use std::path::PathBuf; +use std::sync::Arc; + +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use super::filename::{DeltaFileName, ImageFileName, LayerFileName}; +use super::image_layer::ImageLayer; +use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer}; + +#[derive(Debug)] +pub struct RemoteLayer { + tenantid: TenantId, + timelineid: TimelineId, + key_range: Range, + lsn_range: Range, + + pub file_name: LayerFileName, + + pub layer_metadata: LayerFileMetadata, + + is_delta: bool, + + is_incremental: bool, + + pub(crate) ongoing_download: Arc, +} + +impl Layer for RemoteLayer { + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_state: &mut ValueReconstructState, + ) -> Result { + bail!( + "layer {} needs to be downloaded", + self.filename().file_name() + ); + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + /// debugging function to print out the contents of the layer + fn dump(&self, _verbose: bool) -> Result<()> { + println!( + "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenantid, + self.timelineid, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + Ok(()) + } + + fn short_id(&self) -> String { + self.filename().file_name() + } +} + +impl PersistentLayer for RemoteLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenantid + } + + fn get_timeline_id(&self) -> TimelineId { + self.timelineid 
+ } + + fn filename(&self) -> LayerFileName { + if self.is_delta { + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + .into() + } else { + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + .into() + } + } + + fn local_path(&self) -> Option { + None + } + + fn iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn key_iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn delete(&self) -> Result<()> { + Ok(()) + } + + fn downcast_remote_layer<'a>(self: Arc) -> Option> { + Some(self) + } + + fn is_remote_layer(&self) -> bool { + true + } + + fn file_size(&self) -> Option { + self.layer_metadata.file_size() + } +} + +impl RemoteLayer { + pub fn new_img( + tenantid: TenantId, + timelineid: TimelineId, + fname: &ImageFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn..(fname.lsn + 1), + is_delta: false, + is_incremental: false, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + pub fn new_delta( + tenantid: TenantId, + timelineid: TimelineId, + fname: &DeltaFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn_range.clone(), + is_delta: true, + is_incremental: true, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + /// Create a Layer struct representing this layer, after it has been downloaded. 
+ pub fn create_downloaded_layer( + &self, + conf: &'static PageServerConf, + file_size: u64, + ) -> Arc { + if self.is_delta { + let fname = DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + }; + Arc::new(DeltaLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } else { + let fname = ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + }; + Arc::new(ImageLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } + } +} diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant/tasks.rs similarity index 97% rename from pageserver/src/tenant_tasks.rs rename to pageserver/src/tenant/tasks.rs index d3aec933c2..8397d26e5d 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,8 +8,8 @@ use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; -use crate::tenant_mgr; use tracing::*; use utils::id::TenantId; @@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); @@ -155,7 +155,7 @@ async fn wait_for_active_tenant( wait: Duration, ) -> ControlFlow<(), Arc> { let tenant = loop { - match tenant_mgr::get_tenant(tenant_id, false).await { + match mgr::get_tenant(tenant_id, false).await { Ok(tenant) => break tenant, Err(e) => { error!("Failed to get a tenant {tenant_id}: {e:#}"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cd045d1081..0d8a5fc800 100644 --- a/pageserver/src/tenant/timeline.rs 
+++ b/pageserver/src/tenant/timeline.rs @@ -3,11 +3,15 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pageserver_api::models::TimelineState; -use tokio::sync::watch; -use tokio::task::spawn_blocking; +use pageserver_api::models::{ + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState, +}; +use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; +use tokio_util::sync::CancellationToken; use tracing::*; use std::cmp::{max, min, Ordering}; @@ -15,18 +19,17 @@ use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock}; +use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::storage_sync::index::IndexPart; -use crate::storage_sync::RemoteTimelineClient; +use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; +use crate::tenant::storage_layer::{ + DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, + RemoteLayer, +}; use crate::tenant::{ - delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, - filename::{DeltaFileName, ImageFileName}, - image_layer::{ImageLayer, ImageLayerWriter}, - inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, metadata::{save_metadata, TimelineMetadata}, par_fsync, @@ -36,10 +39,10 @@ use crate::tenant::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; -use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use 
crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; -use crate::tenant_config::TenantConfOpt; +use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; +use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; @@ -51,19 +54,19 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; +use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; -use crate::{page_cache, storage_sync::index::LayerFileMetadata}; -use super::filename::LayerFileName; -use super::storage_layer::Layer; +use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::RemoteTimelineClient; +use super::storage_layer::{DeltaLayer, ImageLayer, Layer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum FlushLoopState { @@ -76,6 +79,8 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, + myself: Weak, + pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -91,10 +96,7 @@ pub struct Timeline { walredo_mgr: Arc, /// Remote storage client. - /// - /// If Some, use it to upload all newly created layers to the remote storage, - /// and keep remote metadata file in sync. In the future, also use it to download - /// layer files on-demand. + /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. pub remote_client: Option>, // What page versions do we hold in the repository? If we get a @@ -158,7 +160,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines.
- pub gc_info: RwLock, + pub gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -176,7 +178,6 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - initial_size_computation_started: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -186,6 +187,8 @@ pub struct Timeline { /// Relation size cache pub rel_size_cache: RwLock>, + download_all_remote_layers_task_info: RwLock>, + state: watch::Sender, } @@ -202,6 +205,8 @@ struct LogicalSize { /// /// NOTE: initial size is not a constant and will change between restarts. initial_logical_size: OnceCell, + /// Semaphore to track ongoing calculation of `initial_logical_size`. + initial_size_computation: Arc, /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. initial_part_end: Option, /// All other size changes after startup, combined together. @@ -252,6 +257,8 @@ impl LogicalSize { fn empty_initial() -> Self { Self { initial_logical_size: OnceCell::with_value(0), + // initial_logical_size already computed, so, don't admit any calculations + initial_size_computation: Arc::new(Semaphore::new(0)), initial_part_end: None, size_added_after_initial: AtomicI64::new(0), } @@ -260,6 +267,7 @@ impl LogicalSize { fn deferred_initial(compute_to: Lsn) -> Self { Self { initial_logical_size: OnceCell::new(), + initial_size_computation: Arc::new(Semaphore::new(1)), initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), } @@ -302,12 +310,68 @@ impl LogicalSize { } } +/// Returned by [`Timeline::layer_size_sum`] +pub enum LayerSizeSum { + /// The result is accurate. + Accurate(u64), + // We don't know the layer file size of one or more layers. + // They contribute to the sum with a value of 0. 
+ // Hence, the sum is a lower bound for the actual layer file size sum. + ApproximateLowerBound(u64), +} + +impl LayerSizeSum { + pub fn approximate_is_ok(self) -> u64 { + match self { + LayerSizeSum::Accurate(v) => v, + LayerSizeSum::ApproximateLowerBound(v) => v, + } + } +} + pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, pub last_received_msg_ts: u128, } +/// Like `?`, but for [`PageReconstructResult`]. +/// Use it to bubble up the `NeedsDownload` and `Error` to the caller. +/// +/// Once `std::ops::Try` is stabilized, we should use it instead of this macro. +#[macro_export] +macro_rules! try_no_ondemand_download { + ($result:expr) => {{ + let result = $result; + match result { + PageReconstructResult::Success(value) => value, + PageReconstructResult::NeedsDownload(timeline, layer) => { + return PageReconstructResult::NeedsDownload(timeline, layer); + } + PageReconstructResult::Error(e) => return PageReconstructResult::Error(e), + } + }}; +} + +/// Replacement for `?` in functions that return [`PageReconstructResult`]. +/// +/// Given an `expr: Result`, use `try_page_reconstruct_result!(expr)` +/// instead of `(expr)?`. +/// If `expr` is `Ok(v)`, the macro evaluates to `v`. +/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`. +/// +/// Once `std::ops::Try` is stabilized, we should use it instead of this macro. +#[macro_export] +macro_rules! try_page_reconstruct_result { + ($result:expr) => {{ + let result = $result; + match result { + Ok(v) => v, + Err(e) => return PageReconstructResult::from(e), + } + }}; +} + /// /// Information about how much history needs to be retained, needed by /// Garbage Collection. @@ -337,6 +401,77 @@ pub struct GcInfo { pub pitr_cutoff: Lsn, } +pub enum PageReconstructResult { + Success(T), + /// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map + /// for the operation to succeed.
Use [`Timeline::download_remote_layer`] to do it, then + /// retry the operation that returned this error. + NeedsDownload(Weak, Weak), + Error(PageReconstructError), +} + +/// An error happened in a get() operation. +#[derive(thiserror::Error)] +pub enum PageReconstructError { + #[error(transparent)] + Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + + #[error(transparent)] + WalRedo(#[from] crate::walredo::WalRedoError), +} + +impl std::fmt::Debug for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Self::Other(err) => err.fmt(f), + Self::WalRedo(err) => err.fmt(f), + } + } +} + +/// This impl makes it so you can substitute return type +/// `Result` with `PageReconstructResult` in functions +/// and existing `?` will generally continue to work. +/// The impl accepts any error type `E` that converts into +/// [`PageReconstructError`] (for example `anyhow::Error`). +impl From for PageReconstructResult +where + E: Into, +{ + fn from(e: E) -> Self { + Self::Error(e.into()) + } +} + +impl PageReconstructResult { + /// Treat the need for on-demand download as an error. + /// + /// **Avoid this function in new code** if you can help it, + /// as on-demand download will become the norm in the future, + /// especially once we implement layer file eviction. + /// + /// If you are in an async function, use [`with_ondemand_download`] + /// to do the download right here. + /// + /// If you are in a sync function, change its return type from + /// `Result` to `PageReconstructResult` and bubble up + /// the non-success cases of `PageReconstructResult` to the caller. + /// This gives them a chance to do the download and retry. + /// Consider using [`try_no_ondemand_download`] for convenience. + /// + /// For more background, read the comment on [`with_ondemand_download`].
+ pub fn no_ondemand_download(self) -> anyhow::Result { + match self { + PageReconstructResult::Success(value) => Ok(value), + // TODO print more info about the timeline + PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"), + PageReconstructResult::Error(e) => { + Err(anyhow::Error::new(e).context("Failed to reconstruct the page")) + } + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -364,8 +499,10 @@ impl Timeline { /// the Repository implementation may incorrectly return a value from an ancestor /// branch, for example, or waste a lot of cycles chasing the non-existing key. /// - pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result { - anyhow::ensure!(lsn.is_valid(), "Invalid LSN"); + pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult { + if !lsn.is_valid() { + return PageReconstructResult::from(anyhow!("Invalid LSN")); + } // Check the page cache. We will get back the most recent page with lsn <= `lsn`. 
// The cached image can be returned directly if there is no WAL between the cached image @@ -375,7 +512,7 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -390,13 +527,18 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state)); self.metrics .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } + // Like get(), but if a remote layer file is needed, it is downloaded as part of this call. + pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result { + with_ondemand_download(|| self.get(key, lsn)).await + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -423,30 +565,27 @@ impl Timeline { } } - /// Get the physical size of the timeline at the latest LSN - pub fn get_physical_size(&self) -> u64 { - self.metrics.current_physical_size_gauge.get() + /// The sum of the file size of all historic layers in the layer map. + /// This method makes no distinction between local and remote layers. + /// Hence, the result **does not represent local filesystem usage**. 
+ pub fn layer_size_sum(&self) -> LayerSizeSum { + let layer_map = self.layers.read().unwrap(); + let mut size = 0; + let mut no_size_cnt = 0; + for l in layer_map.iter_historic_layers() { + let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); + size += l_size; + no_size_cnt += l_no_size; + } + if no_size_cnt == 0 { + LayerSizeSum::Accurate(size) + } else { + LayerSizeSum::ApproximateLowerBound(size) + } } - /// Get the physical size of the timeline at the latest LSN non incrementally - pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - // total size of layer files in the current timeline directory - let mut total_physical_size = 0; - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if ImageFileName::parse_str(&fname).is_some() - || DeltaFileName::parse_str(&fname).is_some() - { - total_physical_size += direntry.metadata()?.len(); - } - } - - Ok(total_physical_size) + pub fn get_resident_physical_size(&self) -> u64 { + self.metrics.resident_physical_size_gauge.get() } /// @@ -494,22 +633,10 @@ impl Timeline { } /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't - /// know anything about them here in the repository. 
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] - pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers_and_wait().await - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers_and_wait().await?; - self.compact().await - } - } + pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { + self.freeze_inmem_layer(false); + self.flush_frozen_layers_and_wait().await } pub async fn compact(&self) -> anyhow::Result<()> { @@ -566,14 +693,18 @@ impl Timeline { // Define partitioning schema if needed - match self.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - ) { + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) + .await + { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + let layer_paths_to_upload = + self.create_image_layers(&partitioning, lsn, false).await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -584,6 +715,18 @@ impl Timeline { let timer = self.metrics.compact_time_histo.start_timer(); self.compact_level0(target_file_size).await?; timer.stop_and_record(); + + // If `create_image_layers` or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens.
+ if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -609,18 +752,22 @@ impl Timeline { /// /// The size could be lagging behind the actual number, in case /// the initial size calculation has not been run (gets triggered on the first size access). - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { + /// + /// return size and boolean flag that shows if the size is exact + pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); + let mut is_exact = true; let size = current_size.size(); if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = (current_size, self.current_logical_size.initial_part_end) { + is_exact = false; self.try_spawn_size_init_task(init_lsn); } - Ok(size) + Ok((size, is_exact)) } /// Check if more than 'checkpoint_distance' of WAL has been accumulated in @@ -744,76 +891,81 @@ impl Timeline { walredo_mgr: Arc, remote_client: Option, pg_version: u32, - ) -> Self { + ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(TimelineState::Suspended); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let mut result = Timeline { - conf, - tenant_conf, - timeline_id, - tenant_id, - pg_version, - layers: RwLock::new(LayerMap::default()), + Arc::new_cyclic(|myself| { + let mut result = Timeline { + conf, + tenant_conf, + myself: myself.clone(), + timeline_id, + tenant_id, + pg_version, + layers: RwLock::new(LayerMap::default()), - walredo_mgr, + walredo_mgr, - remote_client: remote_client.map(Arc::new), + remote_client: remote_client.map(Arc::new), - // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
- last_record_lsn: SeqWait::new(RecordLsn { - last: disk_consistent_lsn, - prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), - }), - disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. + last_record_lsn: SeqWait::new(RecordLsn { + last: disk_consistent_lsn, + prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), + }), + disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_ts: RwLock::new(Instant::now()), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), + last_freeze_ts: RwLock::new(Instant::now()), - ancestor_timeline: ancestor, - ancestor_lsn: metadata.ancestor_lsn(), + ancestor_timeline: ancestor, + ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new(&tenant_id, &timeline_id), + metrics: TimelineMetrics::new(&tenant_id, &timeline_id), - flush_loop_state: Mutex::new(FlushLoopState::NotStarted), + flush_loop_state: Mutex::new(FlushLoopState::NotStarted), - layer_flush_start_tx, - layer_flush_done_tx, + layer_flush_start_tx, + layer_flush_done_tx, - write_lock: Mutex::new(()), - layer_removal_cs: Default::default(), + write_lock: Mutex::new(()), + layer_removal_cs: Default::default(), - gc_info: RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo { + retain_lsns: Vec::new(), + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), + }), - latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), - initdb_lsn: metadata.initdb_lsn(), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), + initdb_lsn: metadata.initdb_lsn(), - current_logical_size: if disk_consistent_lsn.is_valid() { - // we're creating timeline data with some layer files existing locally, - // need to recalculate timeline's logical size based on data in the layers. 
- LogicalSize::deferred_initial(disk_consistent_lsn) - } else { - // we're creating timeline data without any layers existing locally, - // initial logical size is 0. - LogicalSize::empty_initial() - }, - initial_size_computation_started: AtomicBool::new(false), - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), - repartition_threshold: 0, + current_logical_size: if disk_consistent_lsn.is_valid() { + // we're creating timeline data with some layer files existing locally, + // need to recalculate timeline's logical size based on data in the layers. + LogicalSize::deferred_initial(disk_consistent_lsn) + } else { + // we're creating timeline data without any layers existing locally, + // initial logical size is 0. + LogicalSize::empty_initial() + }, + partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + repartition_threshold: 0, - last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), - state, - }; - result.repartition_threshold = result.get_checkpoint_distance() / 10; - result + last_received_wal: Mutex::new(None), + rel_size_cache: RwLock::new(HashMap::new()), + + download_all_remote_layers_task_info: RwLock::new(None), + + state, + }; + result.repartition_threshold = result.get_checkpoint_distance() / 10; + result + }) } pub(super) fn maybe_spawn_flush_loop(self: &Arc) { @@ -927,11 +1079,18 @@ impl Timeline { continue; } - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = ImageLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &imgfilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { @@ -951,16 +1110,23 @@ impl Timeline { continue; } - let layer = - 
DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = DeltaLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &deltafilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these - } else if crate::storage_sync::is_temp_download_file(&direntry_path) { + } else if remote_timeline_client::is_temp_download_file(&direntry_path) { info!( "skipping temp download file, reconcile_with_remote will resume / clean up: {}", fname @@ -989,7 +1155,7 @@ impl Timeline { num_layers, disk_consistent_lsn, total_physical_size ); self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .set(total_physical_size); timer.stop_and_record(); @@ -997,21 +1163,14 @@ impl Timeline { Ok(()) } - async fn download_missing( + async fn create_remote_layers( &self, index_part: &IndexPart, - remote_client: &RemoteTimelineClient, local_layers: HashMap>, up_to_date_disk_consistent_lsn: Lsn, ) -> anyhow::Result>> { // Are we missing some files that are present in remote storage? - // Download them now. - // TODO Downloading many files this way is not efficient. - // Better to use FuturesUnordered. Maybe keep as is because: - // a) inplace download is a throw-away code, on-demand patch doesnt need that - // b) typical case now is that there is nothing to sync, this downloads a lot - // 1) if there was another pageserver that came and generated new files - // 2) during attach of a timeline with big history which we currently do not do + // Create RemoteLayer instances for them. 
let mut local_only_layers = local_layers; for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1023,11 +1182,13 @@ impl Timeline { .unwrap_or(LayerFileMetadata::MISSING); // Is the local layer's size different from the size stored in the - // remote index file? If so, rename_to_backup those files & remove - // local_layer form the layer map. - // We'll download a fresh copy of the layer file below. + // remote index file? + // If so, rename_to_backup those files & replace their local layer with + // a RemoteLayer in the layer map so that we re-download them on-demand. if let Some(local_layer) = local_layer { - let local_layer_path = local_layer.local_path(); + let local_layer_path = local_layer + .local_path() + .expect("caller must ensure that local_layers only contains local layers"); ensure!( local_layer_path.exists(), "every layer from local_layers must exist on disk: {}", @@ -1048,7 +1209,7 @@ impl Timeline { assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { - self.metrics.current_physical_size_gauge.sub(local_size); + self.metrics.resident_physical_size_gauge.sub(local_size); self.layers.write().unwrap().remove_historic(local_layer); // fall-through to adding the remote layer } @@ -1069,7 +1230,7 @@ impl Timeline { } info!( - "remote layer does not exist locally, downloading it now: {}", + "remote layer does not exist locally, creating remote layer: {}", remote_layer_name.file_name() ); @@ -1083,28 +1244,18 @@ impl Timeline { continue; } - trace!("downloading image file: {remote_layer_name:?}"); - let downloaded_size = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download image layer {remote_layer_name:?}") - })?; - trace!("done"); + let 
remote_layer = RemoteLayer::new_img( + self.tenant_id, + self.timeline_id, + imgfilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); - let image_layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(image_layer)); - self.metrics - .current_physical_size_gauge - .add(downloaded_size); + self.layers.write().unwrap().insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { - // Create a DeltaLayer struct for each delta file. + // Create a RemoteLayer for the delta file. // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is // OK for a delta layer to have end LSN 101, but if the end LSN @@ -1112,29 +1263,19 @@ impl Timeline { // before crash. if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { warn!( - "found future delta layer {} on timeline {} remote_consistent_lsn is {}", - deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn - ); + "found future delta layer {} on timeline {} remote_consistent_lsn is {}", + deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn + ); continue; } - - trace!("downloading delta file: {remote_layer_name:?}"); - let sz = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download delta layer {remote_layer_name:?}") - })?; - trace!("done"); - - let delta_layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(delta_layer)); - self.metrics.current_physical_size_gauge.add(sz); + let remote_layer = RemoteLayer::new_delta( + self.tenant_id, + self.timeline_id, + deltafilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); + self.layers.write().unwrap().insert_historic(remote_layer); } 
#[cfg(test)] LayerFileName::Test(_) => unreachable!(), @@ -1144,22 +1285,22 @@ impl Timeline { Ok(local_only_layers) } + /// This function will synchronize local state with what we have in remote storage. /// - /// This function will synchronize local data with what we have in remote storage. - /// 1. It will download missing layer files. - /// 2. It will update local metadata if remote one has greater `disk_consistent_lsn`. - /// 3. It will upload files that are missing on the remote - /// 4. It will update index file on the remote accordingly - /// TODO may be a bit cleaner to do things based on populated remote client, - /// and then do things based on its upload_queue.latest_files + /// Steps taken: + /// 1. Initialize upload queue based on `index_part`. + /// 2. Create `RemoteLayer` instances for layers that exist only on the remote. + /// The list of layers on the remote comes from `index_part`. + /// The list of local layers is given by the layer map's `iter_historic_layers()`. + /// So, the layer map must have been loaded already. + /// 3. Schedule upload of local-only layer files (which will then also update the remote + /// IndexPart to include the new layer files). /// - /// This is used during tenant attach. The layer map must have been loaded - /// with local filesystem contents already. - /// - /// The caller should provide IndexPart if it exists on the remote storage. If it's None, - /// we assume that it is missing on the remote storage, which means that we initialized - /// a timeline and then restarted before successful upload was performed + /// Refer to the `storage_sync` module comment for more context. /// + /// # TODO + /// May be a bit cleaner to do things based on populated remote client, + /// and then do things based on its upload_queue.latest_files. 
#[instrument(skip(self, index_part, up_to_date_metadata))] pub async fn reconcile_with_remote( &self, @@ -1189,8 +1330,7 @@ impl Timeline { index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; - - self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn) + self.create_remote_layers(index_part, local_layers, disk_consistent_lsn) .await? } None => { @@ -1202,7 +1342,10 @@ impl Timeline { // Are there local files that don't exist remotely? Schedule uploads for them for (layer_name, layer) in &local_only_layers { - let layer_path = layer.local_path(); + // XXX solve this in the type system + let layer_path = layer + .local_path() + .expect("local_only_layers only contains local layers"); let layer_size = layer_path .metadata() .with_context(|| format!("failed to get file {layer_path:?} metadata"))? @@ -1211,9 +1354,7 @@ impl Timeline { remote_client .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; } - if !local_only_layers.is_empty() { - remote_client.schedule_index_upload(up_to_date_metadata)?; - } + remote_client.schedule_index_upload_for_file_changes()?; info!("Done"); @@ -1221,70 +1362,186 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { - // Atomically check if the timeline size calculation had already started. - // If the flag was not already set, this sets it. - if !self - .initial_size_computation_started - .swap(true, AtomicOrdering::SeqCst) + let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) + .try_acquire_owned() { - // We need to start the computation task. 
- let self_clone = Arc::clone(self); - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_id), - Some(self.timeline_id), - "initial size calculation", - false, - async move { - let mut timeline_state_updates = self_clone.subscribe_for_state_updates(); - let self_calculation = Arc::clone(&self_clone); - tokio::select! { - calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => { - let calculated_size = calculation_result - .context("Failed to spawn calculation result task")? - .context("Failed to calculate logical size")?; - match self_clone.current_logical_size.initial_logical_size.set(calculated_size) { - Ok(()) => info!("Successfully calculated initial logical size"), - Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), - } - Ok(()) - }, - new_event = async { - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = *timeline_state_updates.borrow(); - match new_state { - // we're running this job for active timelines only - TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state), - } - } - Err(_sender_dropped_error) => return None, - } - } - } => { - match new_event { - Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"), - None => info!("Timeline dropped state updates sender, stopping init size calculation"), - } - Ok(()) - }, + Ok(permit) => permit, + Err(TryAcquireError::NoPermits) => { + // computation already ongoing or finished with success + return; + } + Err(TryAcquireError::Closed) => unreachable!("we never call close"), + }; + debug_assert!(self + .current_logical_size + .initial_logical_size + .get() + .is_none()); + // We need to start 
the computation task. + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + // NB: don't log errors here, task_mgr will do that. + async move { + let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + { + Ok(s) => s, + Err(CalculateLogicalSizeError::Cancelled) => { + // Don't make noise, this is a common task. + // In the unlikely case that there is another call to this function, we'll retry + // because initial_logical_size is still None. + info!("initial size calculation cancelled, likely timeline delete / tenant detach"); + return Ok(()); } - }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)), - ); + x @ Err(_) => x.context("Failed to calculate logical size")?, + }; + match self_clone + .current_logical_size + .initial_logical_size + .set(calculated_size) + { + Ok(()) => (), + Err(existing_size) => { + // This shouldn't happen because the semaphore is initialized with 1. + // But if it happens, just complain & report success so there are no further retries.
+ error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing") + } + } + // now that `initial_logical_size.is_some()`, reduce permit count to 0 + // so that we prevent future callers from spawning this task + permit.forget(); + Ok(()) + }, + ); + } + + pub fn spawn_ondemand_logical_size_calculation( + self: &Arc, + lsn: Lsn, + ) -> oneshot::Receiver> { + let (sender, receiver) = oneshot::channel(); + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "ondemand logical size calculation", + false, + async move { + let res = self_clone.logical_size_calculation_task(lsn).await; + let _ = sender.send(res).ok(); + Ok(()) // Receiver is responsible for handling errors + }, + ); + receiver + } + + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] + async fn logical_size_calculation_task( + self: &Arc, + init_lsn: Lsn, + ) -> Result { + let mut timeline_state_updates = self.subscribe_for_state_updates(); + let self_calculation = Arc::clone(self); + let cancel = CancellationToken::new(); + + let calculation = async { + let cancel = cancel.child_token(); + tokio::task::spawn_blocking(move || { + // Run in a separate thread since this can do a lot of + // synchronous file IO without .await inbetween + // if there are no RemoteLayers that would require downloading. + let h = tokio::runtime::Handle::current(); + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + }) + .await + .context("Failed to spawn calculation result task")? 
+ }; + let timeline_state_cancellation = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + // we're running this job for active timelines only + TimelineState::Active => continue, + TimelineState::Broken + | TimelineState::Stopping + | TimelineState::Suspended => { + break format!("aborted because timeline became inactive (new state: {new_state:?})") + } + } + } + Err(_sender_dropped_error) => { + // can't happen, the sender is not dropped as long as the Timeline exists + break "aborted because state watch was dropped".to_string(); + } + } + } + }; + + let taskmgr_shutdown_cancellation = async { + task_mgr::shutdown_watcher().await; + "aborted because task_mgr shutdown requested".to_string() + }; + + tokio::pin!(calculation); + loop { + tokio::select! { + res = &mut calculation => { return res } + reason = timeline_state_cancellation => { + debug!(reason = reason, "cancelling calculation"); + cancel.cancel(); + return calculation.await; + } + reason = taskmgr_shutdown_cancellation => { + debug!(reason = reason, "cancelling calculation"); + cancel.cancel(); + return calculation.await; + } + } } } /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. - pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { + async fn calculate_logical_size( + &self, + up_to_lsn: Lsn, + cancel: CancellationToken, + ) -> Result { info!( "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn ); + // These failpoints are used by python tests to ensure that we don't delete + // the timeline while the logical size computation is ongoing. + // The first failpoint is used to make this function pause. + // Then the python test initiates timeline delete operation in a thread. 
+ // It waits for a few seconds, then arms the second failpoint and disables + // the first failpoint. The second failpoint prints an error if the timeline + // delete code has deleted the on-disk state while we're still running here. + // It shouldn't do that. If it does it anyway, the error will be caught + // by the test suite, highlighting the problem. + fail::fail_point!("timeline-calculate-logical-size-pause"); + fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { + if !self + .conf + .metadata_path(self.timeline_id, self.tenant_id) + .exists() + { + error!("timeline-calculate-logical-size-pre metadata file does not exist") + } + // need to return something + Ok(0) + }); let timer = if up_to_lsn == self.initdb_lsn { if let Some(size) = self.current_logical_size.initialized_size() { if size != 0 { @@ -1300,7 +1557,9 @@ impl Timeline { } else { self.metrics.logical_size_histo.start_timer() }; - let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?; + let logical_size = self + .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); Ok(logical_size) @@ -1333,18 +1592,27 @@ trait TraversalLayerExt { } impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> String { - debug_assert!( - self.local_path().to_str().unwrap() - .contains(&format!("{}", self.get_timeline_id())), - "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary", - ); - format!("{}", self.local_path().display()) + fn traversal_id(&self) -> TraversalId { + match self.local_path() { + Some(local_path) => { + debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())), + "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary", + ); + format!("{}", local_path.display()) + } + None => { + format!( + "remote {}/{}", + self.get_timeline_id(), + 
self.filename().file_name() + ) + } + } } } impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> String { + fn traversal_id(&self) -> TraversalId { format!( "timeline {} in-memory {}", self.get_timeline_id(), @@ -1367,14 +1635,14 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result<()> { + ) -> PageReconstructResult<()> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. - let mut traversal_path = Vec::<(ValueReconstructResult, Lsn, TraversalId)>::new(); + let mut traversal_path = Vec::::new(); let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { *cached_lsn @@ -1394,12 +1662,12 @@ impl Timeline { // The function should have updated 'state' //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); match result { - ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Complete => return PageReconstructResult::Success(()), ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { self.metrics.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); + return PageReconstructResult::Success(()); } if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. 
Error out to avoid @@ -1432,68 +1700,139 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = timeline.get_ancestor_timeline()?; + let ancestor = match timeline.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return PageReconstructResult::from(e), + }; timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); - continue; + continue 'outer; } - let layers = timeline.layers.read().unwrap(); + #[allow(clippy::never_loop)] // see comment at bottom of this loop + '_layer_map_search: loop { + let remote_layer = { + let layers = timeline.layers.read().unwrap(); - // Check the open and frozen in-memory layers first, in order from newest - // to oldest. - if let Some(open_layer) = &layers.open_layer { - let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); - // Get all the data needed to reconstruct the page version from this layer. - // But if we have an older cached page image, no need to go past that. - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, open_layer.traversal_id())); - continue; - } - } - for frozen_layer in layers.frozen_layers.iter().rev() { - let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); - let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); - continue 'outer; - } - } + // Check the open and frozen in-memory layers first, in order from newest + // to oldest. 
+ if let Some(open_layer) = &layers.open_layer { + let start_lsn = open_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. + let lsn_floor = max(cached_lsn + 1, start_lsn); + result = match open_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; + cont_lsn = lsn_floor; + traversal_path.push(( + result, + cont_lsn, + Box::new({ + let open_layer = Arc::clone(open_layer); + move || open_layer.traversal_id() + }), + )); + continue 'outer; + } + } + for frozen_layer in layers.frozen_layers.iter().rev() { + let start_lsn = frozen_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + let lsn_floor = max(cached_lsn + 1, start_lsn); + result = match frozen_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; + cont_lsn = lsn_floor; + traversal_path.push(( + result, + cont_lsn, + Box::new({ + let frozen_layer = Arc::clone(frozen_layer); + move || frozen_layer.traversal_id() + }), + )); + continue 'outer; + } + } - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? 
{ - //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); - - let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( - key, - lsn_floor..cont_lsn, - reconstruct_state, - )?; - cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, layer.traversal_id())); - } else if timeline.ancestor_timeline.is_some() { - // Nothing on this timeline. Traverse to parent - result = ValueReconstructResult::Continue; - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); - } else { - // Nothing found - result = ValueReconstructResult::Missing; + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { + // If it's a remote layer, download it and retry. + if let Some(remote_layer) = + super::storage_layer::downcast_remote_layer(&layer) + { + // TODO: push a breadcrumb to 'traversal_path' to record the fact that + // we downloaded / would need to download this layer. + remote_layer // download happens outside the scope of `layers` guard object + } else { + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. + let lsn_floor = max(cached_lsn + 1, lsn_floor); + result = match layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; + cont_lsn = lsn_floor; + traversal_path.push(( + result, + cont_lsn, + Box::new({ + let layer = Arc::clone(&layer); + move || layer.traversal_id() + }), + )); + continue 'outer; + } + } else if timeline.ancestor_timeline.is_some() { + // Nothing on this timeline. 
Traverse to parent + result = ValueReconstructResult::Continue; + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + continue 'outer; + } else { + // Nothing found + result = ValueReconstructResult::Missing; + continue 'outer; + } + }; + // Indicate to the caller that we need remote_layer replaced with a downloaded + // layer in the layer map. The control flow could be a lot simpler, but the point + // of this commit is to prepare this function to + // 1. become async + // 2. do the download right here, using + // ``` + // download_remote_layer().await?; + // continue 'layer_map_search; + // ``` + // For (2), current rustc requires that the layers lock guard is not in scope. + // Hence, the complicated control flow. + let remote_layer_as_persistent: Arc = + Arc::clone(&remote_layer) as Arc; + info!( + "need remote layer {}", + remote_layer_as_persistent.traversal_id() + ); + return PageReconstructResult::NeedsDownload( + Weak::clone(&timeline.myself), + Arc::downgrade(&remote_layer), + ); } } } @@ -1710,9 +2049,11 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - let (partitioning, _lsn) = - self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? + let (partitioning, _lsn) = self + .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .await?; + self.create_image_layers(&partitioning, self.initdb_lsn, true) + .await? } else { // normal case, write out a L0 delta layer file. 
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; @@ -1809,13 +2150,9 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { - remote_client - .schedule_layer_file_upload(&path, &layer_metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; } - remote_client - .schedule_index_upload(&metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_index_upload_for_metadata_update(&metadata)?; } Ok(()) @@ -1853,7 +2190,7 @@ impl Timeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); - self.metrics.current_physical_size_gauge.add(sz); + self.metrics.resident_physical_size_gauge.add(sz); // update metrics self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); @@ -1861,15 +2198,28 @@ impl Timeline { Ok((new_delta_filename, LayerFileMetadata::new(sz))) } - fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> { - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 == Lsn(0) - || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + async fn repartition( + &self, + lsn: Lsn, + partition_size: u64, + ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { - let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(partition_size); + let partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 != Lsn(0) + && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold + { + // no repartitioning needed + return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + } + } + let keyspace = self.collect_keyspace(lsn).await?; + let partitioning = keyspace.partition(partition_size); + + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if lsn > partitioning_guard.1 { 
*partitioning_guard = (partitioning, lsn); - return Ok((partitioning_guard.0.clone(), lsn)); + } else { + warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); } Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -1915,7 +2265,7 @@ impl Timeline { Ok(false) } - fn create_image_layers( + async fn create_image_layers( &self, partitioning: &KeyPartitioning, lsn: Lsn, @@ -1942,7 +2292,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn) { + let img = match self.get_download(key, lsn).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2005,7 +2355,9 @@ impl Timeline { layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); layers.insert_historic(Arc::new(l)); } drop(layers); @@ -2083,38 +2435,40 @@ impl Timeline { // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. 
- let all_values_iter = deltas_to_compact - .iter() - .map(|l| l.iter()) - .kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, + let all_values_iter = + itertools::process_results(deltas_to_compact.iter().map(|l| l.iter()), |iter_iter| { + iter_iter.kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false } } else { - false + true } - } else { - true - } - }); + }) + })?; // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = deltas_to_compact - .iter() - .map(|l| l.key_iter()) - .kmerge_by(|a, b| { - let (a_key, a_lsn, _) = a; - let (b_key, b_lsn, _) = b; - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - }); + let mut all_keys_iter = itertools::process_results( + deltas_to_compact.iter().map(|l| l.key_iter()), + |iter_iter| { + iter_iter.kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }) + }, + )?; // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. @@ -2284,6 +2638,11 @@ impl Timeline { deltas_to_compact, } = self.compact_level0_phase1(target_file_size).await?; + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + // Before deleting any layers, we need to wait for their upload ops to finish. // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. 
@@ -2310,7 +2669,9 @@ impl Timeline { } // update the timeline's physical size - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); @@ -2321,10 +2682,11 @@ impl Timeline { // delete the old ones let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { - let path = l.local_path(); - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); + if let Some(path) = l.local_path() { + self.metrics + .resident_physical_size_gauge + .sub(path.metadata()?.len()); + } layer_names_to_delete.push(l.filename()); l.delete()?; layers.remove_historic(l); @@ -2365,55 +2727,71 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - pub(super) fn update_gc_info( + /// + /// NOTE: This function holds a short-lived lock to protect the 'gc_info' + /// field, so that the three values passed as argument are stored + /// atomically. But the caller is responsible for ensuring that no new + /// branches are created that would need to be included in 'retain_lsns', + /// for example. The caller should hold `Tenant::gc_cs` lock to ensure + /// that. + /// + pub(super) async fn update_gc_info( &self, retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, ) -> anyhow::Result<()> { - let mut gc_info = self.gc_info.write().unwrap(); - - gc_info.horizon_cutoff = cutoff_horizon; - gc_info.retain_lsns = retain_lsns; - - // Calculate pitr cutoff point. - // If we cannot determine a cutoff LSN, be conservative and don't GC anything. 
- let mut pitr_cutoff_lsn: Lsn; - - if pitr != Duration::ZERO { - // conservative, safe default is to remove nothing, when we have no - // commit timestamp data available - pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn(); - - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // + // Some unit tests depend on garbage-collection working even when + // CLOG data is missing, so that find_lsn_for_timestamp() doesn't + // work, so avoid calling it altogether if time-based retention is not + // configured. It would be pointless anyway. + let pitr_cutoff = if pitr != Duration::ZERO { let now = SystemTime::now(); if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp)? { - LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + match with_ondemand_download(|| self.find_lsn_for_timestamp(pitr_timestamp)).await? + { + LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. 
debug!("future({})", lsn); - pitr_cutoff_lsn = gc_info.horizon_cutoff; + cutoff_horizon } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); + // conservative, safe default is to remove nothing, when we + // have no commit timestamp data available + *self.get_latest_gc_cutoff_lsn() } LsnForTimestamp::NoData(lsn) => { debug!("nodata({})", lsn); + // conservative, safe default is to remove nothing, when we + // have no commit timestamp data available + *self.get_latest_gc_cutoff_lsn() } } - debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } else { + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention. (Some unit tests depend on garbage-collection - // working even when CLOG data is missing, so that find_lsn_for_timestamp() - // above doesn't work.) - pitr_cutoff_lsn = gc_info.horizon_cutoff; - } - gc_info.pitr_cutoff = pitr_cutoff_lsn; + // No time-based retention was configured. Set time-based cutoff to + // same as LSN based. + cutoff_horizon + }; + + // Grab the lock and update the values + *self.gc_info.write().unwrap() = GcInfo { + retain_lsns, + horizon_cutoff: cutoff_horizon, + pitr_cutoff, + }; Ok(()) } @@ -2487,9 +2865,6 @@ impl Timeline { ); write_guard.store_and_unlock(new_gc_cutoff).wait(); } - // Persist the new GC cutoff value in the metadata file, before - // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); @@ -2600,19 +2975,34 @@ impl Timeline { layers_to_remove.push(Arc::clone(&l)); } - // Actually delete the layers from disk and remove them from the map. - // (couldn't do this in the loop above, because you cannot modify a collection - // while iterating it. 
BTreeMap::retain() would be another option) - let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); - for doomed_layer in layers_to_remove { - let path = doomed_layer.local_path(); - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); - layer_names_to_delete.push(doomed_layer.filename()); - doomed_layer.delete()?; - layers.remove_historic(doomed_layer); - result.layers_removed += 1; + if !layers_to_remove.is_empty() { + // Persist the new GC cutoff value in the metadata file, before + // we actually remove anything. + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + + // Actually delete the layers from disk and remove them from the map. + // (couldn't do this in the loop above, because you cannot modify a collection + // while iterating it. BTreeMap::retain() would be another option) + let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); + for doomed_layer in layers_to_remove { + if let Some(path) = doomed_layer.local_path() { + self.metrics + .resident_physical_size_gauge + .sub(path.metadata()?.len()); + } + layer_names_to_delete.push(doomed_layer.filename()); + doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? 
+ layers.remove_historic(doomed_layer); + result.layers_removed += 1; + } + + if result.layers_removed != 0 { + fail_point!("after-timeline-gc-removed-layers"); + } + + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + } } info!( @@ -2620,14 +3010,6 @@ impl Timeline { result.layers_removed, new_gc_cutoff ); - if result.layers_removed != 0 { - fail_point!("after-timeline-gc-removed-layers"); - } - - if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; - } - result.elapsed = now.elapsed()?; Ok(result) } @@ -2640,7 +3022,7 @@ impl Timeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - ) -> anyhow::Result { + ) -> PageReconstructResult { // Perform WAL redo if needed data.records.reverse(); @@ -2652,9 +3034,11 @@ impl Timeline { key, img_lsn ); - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { - bail!("base image for {} at {} not found", key, request_lsn); + PageReconstructResult::from(anyhow!( + "base image for {key} at {request_lsn} not found" + )) } } else { // We need to do WAL redo. 
@@ -2662,12 +3046,12 @@ impl Timeline { // If we don't have a base image, then the oldest WAL record better initialize // the page if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( + PageReconstructResult::from(anyhow!( "Base image for {} at {} not found, but got {} WAL records", key, request_lsn, data.records.len() - ); + )) } else { if data.img.is_some() { trace!( @@ -2682,14 +3066,18 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = self + let img = match self .walredo_mgr .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .context("Failed to reconstruct a page image:")?; + .context("Failed to reconstruct a page image:") + { + Ok(img) => img, + Err(e) => return PageReconstructResult::from(e), + }; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); - cache + if let Err(e) = cache .memorize_materialized_page( self.tenant_id, self.timeline_id, @@ -2697,29 +3085,347 @@ impl Timeline { last_rec_lsn, &img, ) - .context("Materialized page memoization failed")?; + .context("Materialized page memoization failed") + { + return PageReconstructResult::from(e); + } } - Ok(img) + PageReconstructResult::Success(img) + } + } + } + + /// Download a layer file from remote storage and insert it into the layer map. + /// + /// It's safe to call this function for the same layer concurrently. In that case: + /// - If the layer has already been downloaded, `OK(...)` is returned. + /// - If the layer is currently being downloaded, we wait until that download succeeded / failed. + /// - If it succeeded, we return `Ok(...)`. + /// - If it failed, we or another concurrent caller will initiate a new download attempt. + /// + /// Download errors are classified and retried if appropriate by the underlying RemoteTimelineClient function. + /// It has an internal limit for the maximum number of retries and prints appropriate log messages. 
+ /// If we exceed the limit, it returns an error, and this function passes it through. + /// The caller _could_ retry further by themselves by calling this function again, but _should not_ do it. + /// The reason is that they cannot distinguish permanent errors from temporary ones, whereas + /// the underlying RemoteTimelineClient can. + /// + /// There is no internal timeout or slowness detection. + /// If the caller has a deadline or needs a timeout, they can simply stop polling: + /// we're **cancellation-safe** because the download happens in a separate task_mgr task. + /// So, the current download attempt will run to completion even if we stop polling. + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + pub async fn download_remote_layer( + self: Arc, + remote_layer: Arc, + ) -> anyhow::Result<()> { + let permit = match Arc::clone(&remote_layer.ongoing_download) + .acquire_owned() + .await + { + Ok(permit) => permit, + Err(_closed) => { + info!("download of layer has already finished"); + return Ok(()); + } + }; + + let (sender, receiver) = tokio::sync::oneshot::channel(); + // Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline. + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::RemoteDownloadTask, + Some(self.tenant_id), + Some(self.timeline_id), + &format!("download layer {}", remote_layer.short_id()), + false, + async move { + let remote_client = self.remote_client.as_ref().unwrap(); + + // Does retries + exponential back-off internally. + // When this fails, don't layer further retry attempts here. + let result = remote_client + .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .await; + + if let Ok(size) = &result { + // XXX the temp file is still around in Err() case + // and consumes space until we clean up upon pageserver restart. 
+ self.metrics.resident_physical_size_gauge.add(*size); + + // Download complete. Replace the RemoteLayer with the corresponding + // Delta- or ImageLayer in the layer map. + let new_layer = remote_layer.create_downloaded_layer(self.conf, *size); + let mut layers = self.layers.write().unwrap(); + { + let l: Arc = remote_layer.clone(); + layers.remove_historic(l); + } + layers.insert_historic(new_layer); + drop(layers); + + // Now that we've inserted the download into the layer map, + // close the semaphore. This will make other waiters for + // this download return Ok(()). + assert!(!remote_layer.ongoing_download.is_closed()); + remote_layer.ongoing_download.close(); + } else { + // Keep semaphore open. We'll drop the permit at the end of the function. + } + + // Don't treat it as an error if the task that triggered the download + // is no longer interested in the result. + sender.send(result.map(|_sz| ())).ok(); + + // In case we failed and there are other waiters, this will make one + // of them retry the download in a new task. + // XXX: This resets the exponential backoff because it's a new call to + // download_layer file. + drop(permit); + + Ok(()) + }, + ); + + receiver.await.context("download task cancelled")? 
+ } + + pub async fn spawn_download_all_remote_layers( + self: Arc, + ) -> Result { + let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap(); + if let Some(st) = &*status_guard { + match &st.state { + DownloadRemoteLayersTaskState::Running => { + return Err(st.clone()); + } + DownloadRemoteLayersTaskState::ShutDown + | DownloadRemoteLayersTaskState::Completed => { + *status_guard = None; + } + } + } + + let self_clone = Arc::clone(&self); + let task_id = task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::DownloadAllRemoteLayers, + Some(self.tenant_id), + Some(self.timeline_id), + "download all remote layers task", + false, + async move { + self_clone.download_all_remote_layers().await; + let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); + match &mut *status_guard { + None => { + warn!("tasks status is supposed to be Some(), since we are running"); + } + Some(st) => { + let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap()); + if st.task_id != exp_task_id { + warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id); + } else { + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + }; + Ok(()) + } + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); + + let initial_info = DownloadRemoteLayersTaskInfo { + task_id: format!("{task_id}"), + state: DownloadRemoteLayersTaskState::Running, + total_layer_count: 0, + successful_download_count: 0, + failed_download_count: 0, + }; + *status_guard = Some(initial_info.clone()); + + Ok(initial_info) + } + + async fn download_all_remote_layers(self: &Arc) { + let mut downloads: FuturesUnordered<_> = { + let layers = self.layers.read().unwrap(); + layers + .iter_historic_layers() + .filter_map(|l| l.downcast_remote_layer()) + .map({ + |l| { + let self_clone = Arc::clone(self); + 
self_clone.download_remote_layer(l) + } + }) + .collect() + }; + + macro_rules! lock_status { + ($st:ident) => { + let mut st = self.download_all_remote_layers_task_info.write().unwrap(); + let st = st + .as_mut() + .expect("this function is only called after the task has been spawned"); + assert_eq!( + st.task_id, + format!( + "{}", + task_mgr::current_task_id().expect("we run inside a task_mgr task") + ) + ); + let $st = st; + }; + } + + { + lock_status!(st); + st.total_layer_count = downloads.len().try_into().unwrap(); + } + loop { + tokio::select! { + dl = downloads.next() => { + lock_status!(st); + match dl { + None => break, + Some(Ok(())) => { + st.successful_download_count += 1; + }, + Some(Err(e)) => { + error!(error = %e, "layer download failed"); + st.failed_download_count += 1; + } + } + } + _ = task_mgr::shutdown_watcher() => { + // Kind of pointless to watch for shutdowns here, + // as download_remote_layer spawns other task_mgr tasks internally. + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::ShutDown; + } + } + } + { + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + + pub fn get_download_all_remote_layers_task_info(&self) -> Option { + self.download_all_remote_layers_task_info + .read() + .unwrap() + .clone() + } +} + +/// Helper function to deal with [`PageReconstructResult`]. +/// +/// Takes a sync closure that returns a [`PageReconstructResult`]. +/// If it is [`PageReconstructResult::NeedsDownload`], +/// do the download and retry the closure. +/// +/// ### Background +/// +/// This is a crutch to make on-demand downloads efficient in +/// our async-sync-async sandwich codebase. Some context: +/// +/// - The code that does the downloads uses async Rust. +/// - The code that initiates download is many levels of sync Rust. +/// - The sync code must wait for the download to finish to +/// make further progress. +/// - The sync code is invoked directly from async functions upstack. 
+/// +/// Example (there are also much worse ones where the sandwich is taller) +/// +/// async handle_get_page_at_lsn_request page_service.rs +/// sync get_rel_page_at_lsn timeline.rs +/// sync timeline.get timeline.rs +/// sync get_reconstruct_data timeline.rs +/// async download_remote_layer timeline.rs +/// +/// It is not possible to call `Timeline::download_remote_layer().await` within +/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`] +/// which contains references to the [`Timeline`] and [`RemoteLayer`]. +/// We bubble that error upstack to the async code, which can then call +/// `Timeline::download_remote_layer().await`. +/// That is _efficient_ because tokio can use the same OS thread to do +/// other work while we're waiting for the download. +/// +/// It is a deliberate decision to use a new result type to communicate +/// the need for download instead of adding another variant to [`PageReconstructError`]. +/// The reason is that with the latter approach, any place that does +/// `?` on a `Result<T, PageReconstructError>` will implicitly ignore the +/// need for download. We want that to be explicit, so that +/// - the code base becomes greppable for places that don't do a download +/// - future code changes will need to explicitly address on-demand download +/// +/// Alternatives to consider in the future: +/// +/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread +/// and use it to block_on the download_remote_layer future. +/// That is obviously inefficient as it creates one thread per download. +/// - Convert everything to async. The problem here is that the sync +/// functions are used by many other sync functions. So, the scope +/// creep of such a conversion is tremendous. +/// - Compromise between the two: implement async functions for each sync +/// function. Switch over the hot code paths (GetPage()) to use the +/// async path, so that the hot path doesn't spawn threads.
Other code +/// paths would remain sync initially, and get converted to async over time. +/// +pub async fn with_ondemand_download(mut f: F) -> Result +where + F: Send + FnMut() -> PageReconstructResult, + T: Send, +{ + loop { + let closure_result = f(); + match closure_result { + PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => { + // if the timeline is gone, it has likely been deleted / tenant detached + let tl = weak_timeline.upgrade().context("timeline is gone")?; + // if the remote layer got removed, retry the function, it might succeed now + let remote_layer = match weak_remote_layer.upgrade() { + None => { + info!("remote layer is gone, retrying closure"); + continue; + } + Some(l) => l, + }; + // Does retries internally + tl.download_remote_layer(remote_layer).await?; + // Download successful, retry the closure + continue; + } + PageReconstructResult::Success(closure_value) => return Ok(closure_value), + PageReconstructResult::Error(e) => { + return Err(anyhow::Error::new(e).context("Failed to reconstruct the page")) } } } } +type TraversalPathItem = ( + ValueReconstructResult, + Lsn, + Box TraversalId>, +); + /// Helper function for get_reconstruct_data() to add the path of layers traversed /// to an error, as anyhow context information. -fn layer_traversal_error( - msg: String, - path: Vec<(ValueReconstructResult, Lsn, TraversalId)>, -) -> anyhow::Result<()> { +fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructResult<()> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. 
let mut msg_iter = path - .iter() + .into_iter() .map(|(r, c, l)| { format!( "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, c, l, + r, + c, + l(), ) }) .chain(std::iter::once(msg)); @@ -2727,7 +3433,8 @@ fn layer_traversal_error( let err = anyhow!(msg_iter.next().unwrap()); // Append all subsequent traversals, and the error message 'msg', as contexts. - Err(msg_iter.fold(err, |err, msg| err.context(msg))) + let msg = msg_iter.fold(err, |err, msg| err.context(msg)); + PageReconstructResult::from(msg) } /// Various functions to mutate the timeline. @@ -2787,9 +3494,9 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { let mut new_path = path.to_owned(); for i in 0u32.. { - new_path.set_file_name(format!("{}.{}.old", filename, i)); + new_path.set_file_name(format!("{filename}.{i}.old")); if !new_path.exists() { - std::fs::rename(&path, &new_path)?; + std::fs::rename(path, &new_path)?; return Ok(()); } } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs new file mode 100644 index 0000000000..790b2f59aa --- /dev/null +++ b/pageserver/src/tenant/upload_queue.rs @@ -0,0 +1,213 @@ +use crate::metrics::RemoteOpFileKind; + +use super::storage_layer::LayerFileName; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use std::collections::{HashMap, VecDeque}; +use std::fmt::Debug; + +use std::sync::Arc; +use tracing::info; + +use std::sync::atomic::AtomicU32; +use utils::lsn::Lsn; + +// clippy warns that Uninitialized is much smaller than Initialized, which wastes +// memory for Uninitialized variants. Doesn't matter in practice, there are not +// that many upload queues in a running pageserver, and most of them are initialized +// anyway. 
+#[allow(clippy::large_enum_variant)] +pub(crate) enum UploadQueue { + Uninitialized, + Initialized(UploadQueueInitialized), + Stopped(UploadQueueStopped), +} + +impl UploadQueue { + fn as_str(&self) -> &'static str { + match self { + UploadQueue::Uninitialized => "Uninitialized", + UploadQueue::Initialized(_) => "Initialized", + UploadQueue::Stopped(_) => "Stopped", + } + } +} + +/// This keeps track of queued and in-progress tasks. +pub(crate) struct UploadQueueInitialized { + /// Counter to assign task IDs + pub(crate) task_counter: u64, + + /// All layer files stored in the remote storage, taking into account all + /// in-progress and queued operations + pub(crate) latest_files: HashMap, + + /// How many file uploads or deletions have been scheduled, since the + /// last (scheduling of) metadata index upload? + pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64, + + /// Metadata stored in the remote storage, taking into account all + /// in-progress and queued operations. + /// DANGER: do not return to outside world, e.g., safekeepers. + pub(crate) latest_metadata: TimelineMetadata, + + /// `disk_consistent_lsn` from the last metadata file that was successfully + /// uploaded. `Lsn(0)` if nothing was uploaded yet. + /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. + /// Safekeeper can rely on it to make decisions for WAL storage. + pub(crate) last_uploaded_consistent_lsn: Lsn, + + // Breakdown of different kinds of tasks currently in-progress + pub(crate) num_inprogress_layer_uploads: usize, + pub(crate) num_inprogress_metadata_uploads: usize, + pub(crate) num_inprogress_deletions: usize, + + /// Tasks that are currently in-progress. In-progress means that a tokio Task + /// has been launched for it. An in-progress task can be busy uploading, but it can + /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can + /// be waiting for retry in `exponential_backoff`.
+ pub(crate) inprogress_tasks: HashMap>, + + /// Queued operations that have not been launched yet. They might depend on previous + /// tasks to finish. For example, metadata upload cannot be performed before all + /// preceding layer file uploads have completed. + pub(crate) queued_operations: VecDeque, +} + +pub(crate) struct UploadQueueStopped { + pub(crate) last_uploaded_consistent_lsn: Lsn, +} + +impl UploadQueue { + pub(crate) fn initialize_empty_remote( + &mut self, + metadata: &TimelineMetadata, + ) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + UploadQueue::Uninitialized => (), + UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { + anyhow::bail!("already initialized, state {}", self.as_str()) + } + } + + info!("initializing upload queue for empty remote"); + + let state = UploadQueueInitialized { + // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. + latest_files: HashMap::new(), + latest_files_changes_since_metadata_upload_scheduled: 0, + latest_metadata: metadata.clone(), + // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent + // safekeepers from garbage-collecting anything. 
+ last_uploaded_consistent_lsn: Lsn(0), + // what follows are boring default initializations + task_counter: 0, + num_inprogress_layer_uploads: 0, + num_inprogress_metadata_uploads: 0, + num_inprogress_deletions: 0, + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), + }; + + *self = UploadQueue::Initialized(state); + Ok(self.initialized_mut().expect("we just set it")) + } + + pub(crate) fn initialize_with_current_remote_index_part( + &mut self, + index_part: &IndexPart, + ) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + UploadQueue::Uninitialized => (), + UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { + anyhow::bail!("already initialized, state {}", self.as_str()) + } + } + + let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); + for layer_name in &index_part.timeline_layers { + let layer_metadata = index_part + .layer_metadata + .get(layer_name) + .map(LayerFileMetadata::from) + .unwrap_or(LayerFileMetadata::MISSING); + files.insert(layer_name.to_owned(), layer_metadata); + } + + let index_part_metadata = index_part.parse_metadata()?; + info!( + "initializing upload queue with remote index_part.disk_consistent_lsn: {}", + index_part_metadata.disk_consistent_lsn() + ); + + let state = UploadQueueInitialized { + latest_files: files, + latest_files_changes_since_metadata_upload_scheduled: 0, + latest_metadata: index_part_metadata.clone(), + last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), + // what follows are boring default initializations + task_counter: 0, + num_inprogress_layer_uploads: 0, + num_inprogress_metadata_uploads: 0, + num_inprogress_deletions: 0, + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), + }; + + *self = UploadQueue::Initialized(state); + Ok(self.initialized_mut().expect("we just set it")) + } + + pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + 
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + anyhow::bail!("queue is in state {}", self.as_str()) + } + UploadQueue::Initialized(x) => Ok(x), + } + } +} + +/// An in-progress upload or delete task. +#[derive(Debug)] +pub(crate) struct UploadTask { + /// Unique ID of this task. Used as the key in `inprogress_tasks` above. + pub(crate) task_id: u64, + pub(crate) retries: AtomicU32, + + pub(crate) op: UploadOp, +} + +#[derive(Debug)] +pub(crate) enum UploadOp { + /// Upload a layer file + UploadLayer(LayerFileName, LayerFileMetadata), + + /// Upload the metadata file + UploadMetadata(IndexPart, Lsn), + + /// Delete a file. + Delete(RemoteOpFileKind, LayerFileName), + + /// Barrier. When the barrier operation is reached, + Barrier(tokio::sync::watch::Sender<()>), +} + +impl std::fmt::Display for UploadOp { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + UploadOp::UploadLayer(path, metadata) => { + write!( + f, + "UploadLayer({}, size={:?})", + path.file_name(), + metadata.file_size() + ) + } + UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), + UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()), + UploadOp::Barrier(_) => write!(f, "Barrier"), + } + } +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 46e4acd50c..fb216123c1 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -12,7 +12,7 @@ //! use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; use once_cell::sync::OnceCell; -use std::fs::{File, OpenOptions}; +use std::fs::{self, File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; @@ -240,6 +240,10 @@ impl VirtualFile { self.with_file("fsync", |file| file.sync_all())? } + pub fn metadata(&self) -> Result { + self.with_file("metadata", |file| file.metadata())? 
+ } + /// Helper function that looks up the underlying File for this VirtualFile, /// opening it and evicting some other File if necessary. It calls 'func' /// with the physical File. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e8a2e99f06..1c974f7e2a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,7 +21,6 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. -use anyhow::Context; use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; @@ -32,6 +31,7 @@ use tracing::*; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; +use crate::tenant::{with_ondemand_download, PageReconstructError}; use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,10 +52,11 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result { + pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. - let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; + let checkpoint_bytes = + with_ondemand_download(|| timeline.get_checkpoint(startpoint)).await?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -74,16 +75,15 @@ impl<'a> WalIngest<'a> { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
/// - pub fn ingest_record( + pub async fn ingest_record( &mut self, recdata: Bytes, lsn: Lsn, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, - ) -> Result<()> { + ) -> anyhow::Result<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version) - .context("failed decoding wal record")?; + decode_wal_record(recdata, decoded, self.timeline.pg_version)?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +98,8 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded)?; + self.ingest_heapam_record(&mut buf, modification, decoded) + .await?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -112,7 +113,8 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate)?; + self.ingest_xlog_smgr_truncate(modification, &truncate) + .await?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( "handle RM_DBASE_ID for Postgres version {:?}", @@ -125,7 +127,8 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb)?; + self.ingest_xlog_dbase_create(modification, &createdb) + .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { @@ -148,7 +151,8 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; + self.ingest_xlog_dbase_create(modification, &createdb) + .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { @@ -173,11 +177,13 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), - )?; + ) + .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec)?; + self.ingest_clog_truncate_record(modification, &xlrec) + .await?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -188,7 +194,8 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, - )?; + ) + .await?; } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED || info == pg_constants::XLOG_XACT_ABORT_PREPARED { @@ -198,7 +205,8 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - )?; + ) + .await?; // Remove twophase file. see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", @@ -223,7 +231,8 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), - )?; + ) + .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -234,7 +243,8 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), - )?; + ) + .await?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); self.ingest_multixact_create_record(modification, &xlrec)?; @@ -279,7 +289,8 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk)?; + self.ingest_decoded_block(modification, lsn, decoded, blk) + .await?; } // If checkpoint data was updated, store the new version in the repository @@ -297,18 +308,18 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn ingest_decoded_block( + async fn ingest_decoded_block( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, - ) -> Result<()> { + ) -> Result<(), PageReconstructError> { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, - forknum: blk.forknum as u8, + forknum: blk.forknum, }; // @@ -345,23 +356,25 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze()) + .await?; } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec)?; + self.put_rel_wal_record(modification, rel, blk.blkno, rec) + .await?; } Ok(()) } - fn ingest_heapam_record( + async fn ingest_heapam_record( &mut self, buf: &mut Bytes, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Handle VM bit updates that are implicitly part of heap records. // First, look at the record to determine which VM bits need @@ -440,7 +453,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. 
- let vm_size = self.get_relsize(vm_rel, modification.lsn)?; + let vm_size = self.get_relsize(vm_rel, modification.lsn).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -465,7 +478,8 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, - )?; + ) + .await?; } else { // Clear VM bits for one heap page, or for two pages that reside on // different VM pages. @@ -479,7 +493,8 @@ impl<'a> WalIngest<'a> { old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, - )?; + ) + .await?; } if let Some(old_vm_blk) = old_vm_blk { self.put_rel_wal_record( @@ -491,7 +506,8 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, - )?; + ) + .await?; } } } @@ -501,11 +517,11 @@ impl<'a> WalIngest<'a> { } /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. - fn ingest_xlog_dbase_create( + async fn ingest_xlog_dbase_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rec: &XlCreateDatabase, - ) -> Result<()> { + ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; let src_db_id = rec.src_db_id; @@ -518,16 +534,22 @@ impl<'a> WalIngest<'a> { // get calls instead. 
let req_lsn = modification.tline.get_last_record_lsn(); - let rels = modification - .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn)?; + let rels = with_ondemand_download(|| { + modification + .tline + .list_rels(src_tablespace_id, src_db_id, req_lsn) + }) + .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); // Copy relfilemap - let filemap = modification - .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + let filemap = with_ondemand_download(|| { + modification + .tline + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + }) + .await?; modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; @@ -536,7 +558,9 @@ impl<'a> WalIngest<'a> { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?; + let nblocks = + with_ondemand_download(|| modification.tline.get_rel_size(src_rel, req_lsn, true)) + .await?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, @@ -551,9 +575,12 @@ impl<'a> WalIngest<'a> { for blknum in 0..nblocks { debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); - let content = modification - .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)?; + let content = with_ondemand_download(|| { + modification + .tline + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + }) + .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } @@ -572,7 +599,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, rec: &XlSmgrCreate, - ) -> Result<()> { + ) -> anyhow::Result<()> { let rel = RelTag { spcnode: rec.rnode.spcnode, dbnode: rec.rnode.dbnode, @@ -586,11 +613,11 @@ impl<'a> WalIngest<'a> { /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. 
- fn ingest_xlog_smgr_truncate( + async fn ingest_xlog_smgr_truncate( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rec: &XlSmgrTruncate, - ) -> Result<()> { + ) -> anyhow::Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; let relnode = rec.rnode.relnode; @@ -620,7 +647,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn)?; + let nblocks = self.get_relsize(rel, modification.lsn).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no)?; @@ -641,7 +668,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn)?; + let nblocks = self.get_relsize(rel, modification.lsn).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no)?; @@ -652,12 +679,12 @@ impl<'a> WalIngest<'a> { /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records. /// - fn ingest_xact_record( + async fn ingest_xact_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -713,7 +740,9 @@ impl<'a> WalIngest<'a> { relnode: xnode.relnode, }; let last_lsn = self.timeline.get_last_record_lsn(); - if modification.tline.get_rel_exists(rel, last_lsn, true)? { + if with_ondemand_download(|| modification.tline.get_rel_exists(rel, last_lsn, true)) + .await? 
+ { self.put_rel_drop(modification, rel)?; } } @@ -721,11 +750,11 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn ingest_clog_truncate_record( + async fn ingest_clog_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, xlrec: &XlClogTruncate, - ) -> Result<()> { + ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db @@ -765,10 +794,14 @@ impl<'a> WalIngest<'a> { // it. So we use the previous record's LSN in the get calls // instead. let req_lsn = modification.tline.get_last_record_lsn(); - for segno in modification - .tline - .list_slru_segments(SlruKind::Clog, req_lsn)? - { + + let slru_segments = with_ondemand_download(|| { + modification + .tline + .list_slru_segments(SlruKind::Clog, req_lsn) + }) + .await?; + for segno in slru_segments { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { modification.drop_slru_segment(SlruKind::Clog, segno)?; @@ -917,26 +950,26 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn put_rel_page_image( + async fn put_rel_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; + ) -> anyhow::Result<()> { + self.handle_rel_extend(modification, rel, blknum).await?; modification.put_rel_page_image(rel, blknum, img)?; Ok(()) } - fn put_rel_wal_record( + async fn put_rel_wal_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; + ) -> anyhow::Result<()> { + self.handle_rel_extend(modification, rel, blknum).await?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -946,7 
+979,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { modification.put_rel_truncation(rel, nblocks)?; Ok(()) } @@ -956,33 +989,37 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? { + async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { + let exists = + with_ondemand_download(|| self.timeline.get_rel_exists(rel, lsn, true)).await?; + let nblocks = if !exists { 0 } else { - self.timeline.get_rel_size(rel, lsn, true)? + with_ondemand_download(|| self.timeline.get_rel_size(rel, lsn, true)).await? }; Ok(nblocks) } - fn handle_rel_extend( + async fn handle_rel_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? { - // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0)?; - 0 - } else { - self.timeline.get_rel_size(rel, last_lsn, true)? - }; + let old_nblocks = + if !with_ondemand_download(|| self.timeline.get_rel_exists(rel, last_lsn, true)).await? + { + // create it with 0 size initially, the logic below will extend it + modification.put_rel_creation(rel, 0)?; + 0 + } else { + with_ondemand_download(|| self.timeline.get_rel_size(rel, last_lsn, true)).await? 
+ }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); @@ -996,26 +1033,27 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn put_slru_page_image( + async fn put_slru_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, kind: SlruKind, segno: u32, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - self.handle_slru_extend(modification, kind, segno, blknum)?; + ) -> anyhow::Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum) + .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; Ok(()) } - fn handle_slru_extend( + async fn handle_slru_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification<'_>, kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1025,15 +1063,17 @@ impl<'a> WalIngest<'a> { // record. // TODO: would be nice if to be more explicit about it let last_lsn = self.timeline.get_last_record_lsn(); - let old_nblocks = if !self - .timeline - .get_slru_segment_exists(kind, segno, last_lsn)? + let old_nblocks = if !with_ondemand_download(|| { + self.timeline.get_slru_segment_exists(kind, segno, last_lsn) + }) + .await? { // create it with 0 size initially, the logic below will extend it modification.put_slru_segment_creation(kind, segno, 0)?; 0 } else { - self.timeline.get_slru_segment_size(kind, segno, last_lsn)? + with_ondemand_download(|| self.timeline.get_slru_segment_size(kind, segno, last_lsn)) + .await? 
}; if new_nblocks > old_nblocks { @@ -1081,12 +1121,12 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test(tline: &Timeline) -> Result { + async fn init_walingest_test(tline: &Timeline) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10))?; + let walingest = WalIngest::new(tline, Lsn(0x10)).await?; Ok(walingest) } @@ -1095,62 +1135,107 @@ mod tests { async fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .await?; m.commit()?; - assert_current_logical_size(&*tline, Lsn(0x50)); + assert_current_logical_size(&tline, 
Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); // Check page contents at each LSN assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - 
tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1158,23 +1243,39 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x60)); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; m.commit()?; - assert_current_logical_size(&*tline, Lsn(0x60)); + assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1182,35 +1283,62 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x68)); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x68), false) + .no_ondemand_download()?, + 0 + ); // Extend from 
0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x70), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .no_ondemand_download()?, ZERO_PAGE ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + 1501 + ); for blk in 2..1500 { assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .no_ondemand_download()?, ZERO_PAGE ); } assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1500") ); @@ -1223,15 +1351,27 @@ mod tests { async fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = 
init_walingest_test(&tline).await?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .await?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); @@ -1239,19 +1379,36 @@ mod tests { m.commit()?; // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false) + .no_ondemand_download()?, + false + ); // FIXME: should fail //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .await?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + 1 + ); Ok(()) } @@ -1263,30 +1420,52 @@ mod tests { async fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - 
let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline).await?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; let mut m = tline.begin_modification(Lsn(0x20)); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .await?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1298,24 +1477,38 @@ mod tests { m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - 
tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } // should still see all blocks with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + relsize + ); for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1326,18 +1519,32 @@ mod tests { let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .await?; } m.commit()?; - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1351,21 +1558,25 @@ mod tests { async fn test_large_rel() -> Result<()> { let tenant = TenantHarness::create("test_large_rel")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + 
let mut walingest = init_walingest_test(&tline).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .await?; m.commit()?; } - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE + 1 ); @@ -1374,8 +1585,13 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, + RELSEG_SIZE + ); + assert_current_logical_size(&tline, Lsn(lsn)); // Truncate another block lsn += 0x10; @@ -1383,10 +1599,12 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE - 1 ); - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time // This tests the behavior at segment boundaries @@ -1397,13 +1615,15 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, size as BlockNumber ); size -= 1; } - 
assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index e627e9ecd0..aaf46579a7 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -44,10 +44,13 @@ pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result let broker_endpoint = conf.broker_endpoint.clone(); // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = storage_broker::connect(broker_endpoint.clone()).context(format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ))?; + let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; if BROKER_CLIENT.set(broker_client).is_err() { panic!("broker already initialized"); @@ -126,15 +129,21 @@ impl TaskHandle { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), Err(_task_channel_part_dropped) => { - TaskEvent::End(match self.join_handle.take() { + TaskEvent::End(match self.join_handle.as_mut() { Some(jh) => { if !jh.is_finished() { warn!("sender is dropped while join handle is still alive"); } - jh.await + let res = jh + .await .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) - .and_then(|x| x) + .and_then(|x| x); + + // For cancellation-safety, drop join_handle only after successful .await. 
+ self.join_handle = None; + + res } None => { // Another option is to have an enum, join handle or result and give away the reference to it diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 8048707480..8b60e59305 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -145,21 +145,17 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Update(c) => { - match c { - TaskStateUpdate::Init | TaskStateUpdate::Started => {}, - TaskStateUpdate::Progress(status) => { - if status.has_processed_wal { - // We have advanced last_record_lsn by processing the WAL received - // from this safekeeper. This is good enough to clean unsuccessful - // retries history and allow reconnecting to this safekeeper without - // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); - } - wal_connection.status = status.to_owned(); - } + TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { + if new_status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. 
+ walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); } - }, + wal_connection.status = new_status; + } TaskEvent::End(walreceiver_task_result) => { match walreceiver_task_result { Ok(()) => debug!("WAL receiving task finished"), @@ -210,7 +206,18 @@ async fn connection_manager_loop_step( } }, - _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} + Some(()) = async { + match time_until_next_retry { + Some(sleep_time) => { + tokio::time::sleep(sleep_time).await; + Some(()) + }, + None => { + debug!("No candidates to retry, waiting indefinitely for the broker events"); + None + } + } + } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), } if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { @@ -400,7 +407,7 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id)) + .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) }); let now = Utc::now().naive_utc(); @@ -480,20 +487,25 @@ impl WalreceiverState { .values() .filter_map(|retry| retry.next_retry_at) .filter(|next_retry_at| next_retry_at > &now) - .min(); + .min()?; - next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) + (next_retry_at - now).to_std().ok() } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. 
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { - self.wal_stream_candidates.insert( - NodeId(timeline_update.safekeeper_id), + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); + let old_entry = self.wal_stream_candidates.insert( + new_safekeeper_id, BrokerSkTimeline { timeline: timeline_update, latest_update: Utc::now().naive_utc(), }, ); + + if old_entry.is_none() { + info!("New SK node was added: {new_safekeeper_id}"); + } } /// Cleans up stale broker records and checks the rest for the new connection candidate. @@ -720,12 +732,13 @@ impl WalreceiverState { /// Remove candidates which haven't sent broker updates for a while. fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); + let lagging_wal_timeout = self.lagging_wal_timeout; self.wal_stream_candidates.retain(|node_id, broker_info| { if let Ok(time_since_latest_broker_update) = (Utc::now().naive_utc() - broker_info.latest_update).to_std() { - let should_retain = time_since_latest_broker_update < self.lagging_wal_timeout; + let should_retain = time_since_latest_broker_update < lagging_wal_timeout; if !should_retain { node_ids_to_remove.push(*node_id); } @@ -735,8 +748,11 @@ impl WalreceiverState { } }); - for node_id in node_ids_to_remove { - self.wal_connection_retries.remove(&node_id); + if !node_ids_to_remove.is_empty() { + for node_id in node_ids_to_remove { + info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + self.wal_connection_retries.remove(&node_id); + } } } @@ -789,7 +805,7 @@ fn wal_stream_connection_config( auth_token: Option<&str>, ) -> anyhow::Result { let (host, port) = - parse_host_port(&listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; + parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); 
Ok(PgConnectionConfig::new_host_port(host, port) .extend_options([ @@ -883,10 +899,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), @@ -1045,10 +1061,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), @@ -1110,10 +1126,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index cf2a99f1b5..aca5e8e019 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -1,6 +1,7 @@ //! Actual Postgres connection handler to stream WAL to the server. 
use std::{ + error::Error, str::FromStr, sync::Arc, time::{Duration, SystemTime}, @@ -11,7 +12,7 @@ use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres::{SimpleQueryMessage, SimpleQueryRow}; +use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::v14::xlog_utils::normalize_lsn; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; @@ -32,10 +33,10 @@ use crate::{ use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; use pq_proto::ReplicationFeedback; -use utils::lsn::Lsn; +use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error}; /// Status of the connection. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] pub struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, @@ -68,10 +69,17 @@ pub async fn handle_walreceiver_connection( let mut config = wal_source_connconf.to_tokio_postgres_config(); config.application_name("pageserver"); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); - time::timeout(connect_timeout, config.connect(postgres::NoTls)) - .await - .context("Timed out while waiting for walreceiver connection to open")? - .context("Failed to open walreceiver connection")? 
+ match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { + Ok(Ok(client_and_conn)) => client_and_conn, + Ok(Err(conn_err)) => { + let expected_error = ignore_expected_errors(conn_err)?; + info!("DB connection stream finished: {expected_error}"); + return Ok(()); + } + Err(elapsed) => anyhow::bail!( + "Timed out while waiting {elapsed} for walreceiver connection to open" + ), + } }; info!("connected!"); @@ -83,7 +91,7 @@ pub async fn handle_walreceiver_connection( streaming_lsn: None, commit_lsn: None, }; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); return Ok(()); } @@ -103,10 +111,8 @@ pub async fn handle_walreceiver_connection( connection_result = connection => match connection_result{ Ok(()) => info!("Walreceiver db connection closed"), Err(connection_error) => { - if connection_error.is_closed() { - info!("Connection closed regularly: {connection_error}") - } else { - warn!("Connection aborted: {connection_error}") + if let Err(e) = ignore_expected_errors(connection_error) { + warn!("Connection aborted: {e:#}") } } }, @@ -135,7 +141,7 @@ pub async fn handle_walreceiver_connection( connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); return Ok(()); } @@ -173,7 +179,7 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest 
= WalIngest::new(timeline.as_ref(), startpoint)?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?; while let Some(replication_message) = { select! { @@ -184,7 +190,15 @@ pub async fn handle_walreceiver_connection( replication_message = physical_stream.next() => replication_message, } } { - let replication_message = replication_message?; + let replication_message = match replication_message { + Ok(message) => message, + Err(replication_error) => { + let expected_error = ignore_expected_errors(replication_error)?; + info!("Replication stream finished: {expected_error}"); + return Ok(()); + } + }; + let now = Utc::now().naive_utc(); let last_rec_lsn_before_msg = last_rec_lsn; @@ -207,7 +221,7 @@ pub async fn handle_walreceiver_connection( } &_ => {} }; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -236,8 +250,9 @@ pub async fn handle_walreceiver_connection( ensure!(lsn.is_aligned()); walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) - .context("could not ingest record at {lsn}")?; + .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded) + .await + .with_context(|| format!("could not ingest record at {lsn}"))?; fail_point!("walreceiver-after-ingest"); @@ -273,8 +288,7 @@ pub async fn handle_walreceiver_connection( if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { // We have successfully processed at least one WAL record. 
connection_status.has_processed_wal = true; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) - { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -313,10 +327,11 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. + let (timeline_logical_size, _) = timeline + .get_current_logical_size() + .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { - current_timeline_size: timeline - .get_current_logical_size() - .context("Status update creation failed to get current logical size")?, + current_timeline_size: timeline_logical_size, ps_writelsn: write_lsn, ps_flushlsn: flush_lsn, ps_applylsn: apply_lsn, @@ -386,3 +401,32 @@ async fn identify_system(client: &mut Client) -> anyhow::Result Err(IdentifyError.into()) } } + +/// We don't want to report connectivity problems as real errors towards connection manager because +/// 1. they happen frequently enough to make server logs hard to read and +/// 2. the connection manager can retry other safekeeper. +/// +/// If this function returns `Ok(pg_error)`, it's such an error. +/// The caller should log it at info level and then report to connection manager that we're done handling this connection. +/// Connection manager will then handle reconnections. +/// +/// If this function returns an `Err()`, the caller can bubble it up using `?`. +/// The connection manager will log the error at ERROR level. 
+fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result { + if pg_error.is_closed() + || pg_error + .source() + .and_then(|source| source.downcast_ref::()) + .map(is_expected_io_error) + .unwrap_or(false) + { + return Ok(pg_error); + } else if let Some(db_error) = pg_error.as_db_error() { + if db_error.code() == &SqlState::CONNECTION_FAILURE + && db_error.message().contains("end streaming") + { + return Ok(pg_error); + } + } + Err(pg_error).context("connection error") +} diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 38fb9a4247..7581140934 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1,6 +1,7 @@ //! //! Functions for parsing WAL records. //! + use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::pg_constants; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ca7cfb7413..7cf489562b 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -409,7 +409,7 @@ impl PostgresRedoManager { key ); for &xid in xids { - let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -459,7 +459,7 @@ impl PostgresRedoManager { key ); for &xid in xids { - let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -647,7 +647,7 @@ impl PostgresRedoProcess { info!("running initdb in {}", datadir.display()); let initdb = Command::new(pg_bin_dir_path.join("initdb")) - .args(&["-D", &datadir.to_string_lossy()]) + .args(["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile 
index 7f4e30a12e..ec377dbb1e 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,11 +4,12 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ + file_cache.o \ libpagestore.o \ libpqwalproposer.o \ + neon.o \ pagestore_smgr.o \ relsize_cache.o \ - neon.o \ walproposer.o \ walproposer_utils.o diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c new file mode 100644 index 0000000000..96c2461e2d --- /dev/null +++ b/pgxn/neon/file_cache.c @@ -0,0 +1,597 @@ +/* + * + * file_cache.c + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * pgxn/neon/file_cache.c + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include + +#include "postgres.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "pagestore_client.h" +#include "access/parallel.h" +#include "postmaster/bgworker.h" +#include "storage/relfilenode.h" +#include "storage/buf_internals.h" +#include "storage/latch.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/dynahash.h" +#include "utils/guc.h" +#include "storage/fd.h" +#include "storage/pg_shmem.h" +#include "storage/buf_internals.h" + +/* + * Local file cache is used to temporary store relations pages in local file system. + * All blocks of all relations are stored inside one file and addressed using shared hash map. + * Currently LRU eviction policy based on L2 list is used as replacement algorithm. + * As far as manipulation of L2-list requires global critical section, we are not using partitioned hash. + * Also we are using exclusive lock even for read operation because LRU requires relinking element in L2 list. + * If this lock become a bottleneck, we can consider other eviction strategies, for example clock algorithm. 
+ * + * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about + * its consistency. + */ + +/* Local file storage allocation chunk. + * Should be power of two and not less than 32. Using larger than page chunks can + * 1. Reduce hash-map memory footprint: 8TB database contains billion pages + * and size of hash entry is 40 bytes, so we need 40Gb just for hash map. + * 1Mb chunks can reduce hash map size to 320Mb. + * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed + */ +#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ +#define MB ((uint64)1024*1024) + +#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) + +typedef struct FileCacheEntry +{ + BufferTag key; + uint32 offset; + uint32 access_count; + uint32 bitmap[BLOCKS_PER_CHUNK/32]; + dlist_node lru_node; /* LRU list node */ +} FileCacheEntry; + +typedef struct FileCacheControl +{ + uint32 size; /* size of cache file in chunks */ + dlist_head lru; /* double linked list for LRU replacement algorithm */ +} FileCacheControl; + +static HTAB* lfc_hash; +static int lfc_desc; +static LWLockId lfc_lock; +static int lfc_max_size; +static int lfc_size_limit; +static char* lfc_path; +static FileCacheControl* lfc_ctl; +static shmem_startup_hook_type prev_shmem_startup_hook; +#if PG_VERSION_NUM>=150000 +static shmem_request_hook_type prev_shmem_request_hook; +#endif + +static void +lfc_shmem_startup(void) +{ + bool found; + static HASHCTL info; + + if (prev_shmem_startup_hook) + { + prev_shmem_startup_hook(); + } + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found); + if (!found) + { + uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); + lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock"); + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(FileCacheEntry); + lfc_hash = ShmemInitHash("lfc_hash", + /* 
lfc_size+1 because we add new element to hash table before eviction of victim */ + lfc_size+1, lfc_size+1, + &info, + HASH_ELEM | HASH_BLOBS); + lfc_ctl->size = 0; + dlist_init(&lfc_ctl->lru); + + /* Remove file cache on restart */ + (void)unlink(lfc_path); + } + LWLockRelease(AddinShmemInitLock); +} + +static void +lfc_shmem_request(void) +{ +#if PG_VERSION_NUM>=150000 + if (prev_shmem_request_hook) + prev_shmem_request_hook(); +#endif + + RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry))); + RequestNamedLWLockTranche("lfc_lock", 1); +} + +bool +lfc_check_limit_hook(int *newval, void **extra, GucSource source) +{ + if (*newval > lfc_max_size) + { + elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); + return false; + } + return true; +} + +void +lfc_change_limit_hook(int newval, void *extra) +{ + uint32 new_size = SIZE_MB_TO_CHUNKS(newval); + /* + * Stats collector detach shared memory, so we should not try to access shared memory here. + * Parallel workers first assign default value (0), so not perform truncation in parallel workers. 
+ */ + if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker()) + return; + + /* Open cache file if not done yet */ + if (lfc_desc == 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + if (lfc_desc < 0) { + elog(LOG, "Failed to open file cache %s: %m", lfc_path); + lfc_size_limit = 0; /* disable file cache */ + return; + } + } + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */ + FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + Assert(victim->access_count == 0); +#ifdef FALLOC_FL_PUNCH_HOLE + if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0) + elog(LOG, "Failed to punch hole in file: %m"); +#endif + hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); + lfc_ctl->size -= 1; + } + elog(LOG, "set local file cache limit to %d", new_size); + LWLockRelease(lfc_lock); +} + +void +lfc_init(void) +{ + /* + * In order to create our shared memory area, we have to be loaded via + * shared_preload_libraries. 
+ */ + if (!process_shared_preload_libraries_in_progress) + elog(ERROR, "Neon module should be loaded via shared_preload_libraries"); + + DefineCustomIntVariable("neon.max_file_cache_size", + "Maximal size of Neon local file cache", + NULL, + &lfc_max_size, + 0, /* disabled by default */ + 0, + INT_MAX, + PGC_POSTMASTER, + GUC_UNIT_MB, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("neon.file_cache_size_limit", + "Current limit for size of Neon local file cache", + NULL, + &lfc_size_limit, + 0, /* disabled by default */ + 0, + INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, + lfc_change_limit_hook, + NULL); + + DefineCustomStringVariable("neon.file_cache_path", + "Path to local file cache (can be raw device)", + NULL, + &lfc_path, + "file.cache", + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + if (lfc_max_size == 0) + return; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = lfc_shmem_startup; +#if PG_VERSION_NUM>=150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = lfc_shmem_request; +#else + lfc_shmem_request(); +#endif +} + +/* + * Check if page is present in the cache. + * Returns true if page is found in local cache. + */ +bool +lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +{ + BufferTag tag; + FileCacheEntry* entry; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + bool found; + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return false; + + tag.rnode = rnode; + tag.forkNum = forkNum; + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_SHARED); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0; + LWLockRelease(lfc_lock); + return found; +} + +/* + * Try to read page from local cache. + * Returns true if page is found in local cache. 
+ * In case of error lfc_size_limit is set to zero to disable any further opera-tins with cache. + */ +bool +lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + BufferTag tag; + FileCacheEntry* entry; + ssize_t rc; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + bool result = true; + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return false; + + tag.rnode = rnode; + tag.forkNum = forkNum; + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) + { + /* Page is not cached */ + LWLockRelease(lfc_lock); + return false; + } + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->lru_node); + LWLockRelease(lfc_lock); + + /* Open cache file if not done yet */ + if (lfc_desc == 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + if (lfc_desc < 0) { + elog(LOG, "Failed to open file cache %s: %m", lfc_path); + lfc_size_limit = 0; /* disable file cache */ + result = false; + } + } + + if (lfc_desc > 0) + { + rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + if (rc != BLCKSZ) + { + elog(INFO, "Failed to read file cache: %m"); + lfc_size_limit = 0; /* disable file cache */ + result = false; + } + } + + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + Assert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + LWLockRelease(lfc_lock); + + return result; +} + +/* + * Put page in local file cache. + * If cache is full then evict some other page. 
+ */ +void +lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + BufferTag tag; + FileCacheEntry* entry; + ssize_t rc; + bool found; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return; + + tag.rnode = rnode; + tag.forkNum = forkNum; + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + + if (found) + { + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->lru_node); + } + else + { + /* + * We have two choices if all cache pages are pinned (i.e. used in IO operations): + * 1. Wait until some of this operation is completed and pages is unpinned + * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit. + * As far as probability of such event (that all pages are pinned) is considered to be very very small: + * there are should be very large number of concurrent IO operations and them are limited by max_connections, + * we prefer not to complicate code and use second approach. 
+ */ + if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + Assert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); + elog(LOG, "Swap file cache page"); + } + else + entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */ + entry->access_count = 1; + memset(entry->bitmap, 0, sizeof entry->bitmap); + } + LWLockRelease(lfc_lock); + + /* Open cache file if not done yet */ + if (lfc_desc == 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + if (lfc_desc < 0) { + elog(LOG, "Failed to open file cache %s: %m", lfc_path); + lfc_size_limit = 0; /* disable file cache */ + } + } + if (lfc_desc > 0) + { + rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + if (rc != BLCKSZ) + { + elog(INFO, "Failed to write file cache: %m"); + lfc_size_limit = 0; /* disable file cache */ + } + } + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + Assert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + if (lfc_size_limit != 0) + entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + LWLockRelease(lfc_lock); +} + + +/* + * Record structure holding the to be exposed cache data. + */ +typedef struct +{ + uint32 pageoffs; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + ForkNumber forknum; + BlockNumber blocknum; + uint16 accesscount; +} LocalCachePagesRec; + +/* + * Function context for data persisting over repeated calls. 
+ */ +typedef struct +{ + TupleDesc tupdesc; + LocalCachePagesRec *record; +} LocalCachePagesContext; + +/* + * Function returning data from the local file cache + * relation node/tablespace/database/blocknum and access_counter + */ +PG_FUNCTION_INFO_V1(local_cache_pages); + +#define NUM_LOCALCACHE_PAGES_ELEM 7 + +Datum +local_cache_pages(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + LocalCachePagesContext *fctx; /* User function context. */ + TupleDesc tupledesc; + TupleDesc expected_tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + HASH_SEQ_STATUS status; + FileCacheEntry* entry; + uint32 n_pages = 0; + uint32 i; + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* Create a user function context for cross-call persistence */ + fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext)); + + /* + * To smoothly support upgrades from version 1.0 of this extension + * transparently handle the (non-)existence of the pinning_backends + * column. We unfortunately have to get the result type for that... - + * we can't use the result type determined by the function definition + * without potentially crashing when somebody uses the old (or even + * wrong) function definition though. + */ + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. 
*/ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", + INT2OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount", + INT4OID, -1, 0); + + fctx->tupdesc = BlessTupleDesc(tupledesc); + + LWLockAcquire(lfc_lock, LW_SHARED); + + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0; + } + fctx->record = (LocalCachePagesRec *) + MemoryContextAllocHuge(CurrentMemoryContext, + sizeof(LocalCachePagesRec) * n_pages); + + /* Set max calls and remember the user function context. */ + funcctx->max_calls = n_pages; + funcctx->user_fctx = fctx; + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + /* + * Scan through all the buffers, saving the relevant fields in the + * fctx->record structure. + * + * We don't hold the partition locks, so we don't get a consistent + * snapshot across all buffers, but we do grab the buffer header + * locks, so the information of each buffer is self-consistent. 
+ */ + n_pages = 0; + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + if (entry->bitmap[i >> 5] & (1 << (i & 31))) + { + fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i; + fctx->record[n_pages].relfilenode = entry->key.rnode.relNode; + fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode; + fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode; + fctx->record[n_pages].forknum = entry->key.forkNum; + fctx->record[n_pages].blocknum = entry->key.blockNum + i; + fctx->record[n_pages].accesscount = entry->access_count; + n_pages += 1; + } + } + } + Assert(n_pages == funcctx->max_calls); + LWLockRelease(lfc_lock); + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + Datum values[NUM_LOCALCACHE_PAGES_ELEM]; + bool nulls[NUM_LOCALCACHE_PAGES_ELEM] = { + false, false, false, false, false, false, false + }; + + values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs); + values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode); + values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); + values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); + values[4] = ObjectIdGetDatum(fctx->record[i].forknum); + values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); + values[6] = Int32GetDatum(fctx->record[i].accesscount); + + /* Build and return the tuple. 
*/ + tuple = heap_form_tuple(fctx->tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1aba2e1ede..88e3a12d96 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -111,6 +111,7 @@ pageserver_connect() PQfinish(pageserver_conn); pageserver_conn = NULL; FreeWaitEventSet(pageserver_conn_wes); + pageserver_conn_wes = NULL; neon_log(ERROR, "could not complete handshake with pageserver: %s", msg); @@ -179,7 +180,10 @@ pageserver_disconnect(void) prefetch_on_ps_disconnect(); } if (pageserver_conn_wes != NULL) + { FreeWaitEventSet(pageserver_conn_wes); + pageserver_conn_wes = NULL; + } } static void @@ -206,7 +210,7 @@ pageserver_send(NeonRequest * request) */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { - char *msg = PQerrorMessage(pageserver_conn); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(); neon_log(ERROR, "failed to send page request: %s", msg); @@ -239,29 +243,33 @@ pageserver_receive(void) PG_TRY(); { /* read response */ - resp_buff.len = call_PQgetCopyData(&resp_buff.data); - resp_buff.cursor = 0; + int rc; - if (resp_buff.len < 0) + rc = call_PQgetCopyData(&resp_buff.data); + if (rc >= 0) { - if (resp_buff.len == -1) + resp_buff.len = rc; + resp_buff.cursor = 0; + resp = nm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) { - pageserver_disconnect(); - return NULL; + char *msg = nm_to_string((NeonMessage *) resp); + + neon_log(PageStoreTrace, "got response: %s", msg); + pfree(msg); } - else if (resp_buff.len == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); } - resp = nm_unpack_response(&resp_buff); - PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) + else if (rc == -1) { - char *msg = 
nm_to_string((NeonMessage *) resp); - - neon_log(PageStoreTrace, "got response: %s", msg); - pfree(msg); + pageserver_disconnect(); + resp = NULL; } + else if (rc == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + else + neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc); } PG_CATCH(); { @@ -420,7 +428,7 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomStringVariable("neon.safekeeper_token_env", - "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $ZENITH_AUTH_TOKEN", + "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN", NULL, &safekeeper_token_env, NULL, @@ -516,4 +524,5 @@ pg_init_libpagestore(void) smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; } + lfc_init(); } diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql index 58b98a5923..6cf111ea6a 100644 --- a/pgxn/neon/neon--1.0.sql +++ b/pgxn/neon/neon--1.0.sql @@ -22,3 +22,13 @@ AS 'MODULE_PATHNAME', 'backpressure_throttling_time' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION local_cache_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'local_cache_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. 
+CREATE VIEW local_cache AS + SELECT P.* FROM local_cache_pages() AS P + (pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, accesscount int4); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 170a0cb72d..831756b849 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -203,4 +203,11 @@ extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumbe extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); +/* functions for local file cache */ +extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_init(void); + + #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 73bf330baf..0b34cb3ca9 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1669,7 +1669,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, * (leaving holes). 
But this rule is violated in PG-15 where CreateAndCopyRelationData * call smgrextend for destination relation n using size of source relation */ - get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks); + n_blocks = neon_nblocks(reln, forkNum); while (n_blocks < blkno) neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); @@ -1684,6 +1684,8 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); + lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); @@ -1757,6 +1759,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum)) + return false; + tag = (BufferTag) { .rnode = reln->smgr_rnode.node, .forkNum = forknum, @@ -1899,6 +1904,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, { case T_NeonGetPageResponse: memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rnode, forkNum, blkno, buffer); break; case T_NeonErrorResponse: @@ -1950,6 +1956,12 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + /* Try to read from local file cache */ + if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer)) + { + return; + } + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); @@ -2111,6 +2123,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, forknum, blocknum, (uint32) (lsn >> 32), (uint32) lsn); + lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwrite(reln, forknum, blocknum, 
buffer, skipFsync); diff --git a/poetry.lock b/poetry.lock index 2fa7f03679..edbcddd576 100644 --- a/poetry.lock +++ b/poetry.lock @@ -941,11 +941,11 @@ xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] name = "mypy" -version = "0.971" +version = "0.991" description = "Optional static typing for Python" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] mypy-extensions = ">=0.4.3" @@ -954,6 +954,7 @@ typing-extensions = ">=3.10" [package.extras] dmypy = ["psutil (>=4.0)"] +install-types = ["pip"] python2 = ["typed-ast (>=1.4.0,<2)"] reports = ["lxml"] @@ -1227,6 +1228,17 @@ pytest = ">=6.1.0" [package.extras] testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] +[[package]] +name = "pytest-httpserver" +version = "1.0.6" +description = "pytest-httpserver is a httpserver for pytest" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" + +[package.dependencies] +Werkzeug = ">=2.0.0" + [[package]] name = "pytest-lazy-fixture" version = "0.6.3" @@ -1406,7 +1418,7 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.0" +version = "65.5.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "main" optional = false @@ -1414,7 +1426,7 @@ python-versions = ">=3.7" [package.extras] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler 
(>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -1583,7 +1595,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "98d63eaa73253882440e0fc8cdb305bb536944768c5ba313c25d0ee65f546544" +content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" [metadata.files] aiopg = [ @@ -1949,29 +1961,36 @@ moto = [ {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] mypy = [ - {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, - {file = "mypy-0.971-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:98e02d56ebe93981c41211c05adb630d1d26c14195d04d95e49cd97dbc046dc5"}, - {file = "mypy-0.971-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:19830b7dba7d5356d3e26e2427a2ec91c994cd92d983142cbd025ebe81d69cf3"}, - {file = "mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02ef476f6dcb86e6f502ae39a16b93285fef97e7f1ff22932b657d1ef1f28655"}, - {file = "mypy-0.971-cp310-cp310-win_amd64.whl", hash 
= "sha256:25c5750ba5609a0c7550b73a33deb314ecfb559c350bb050b655505e8aed4103"}, - {file = "mypy-0.971-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d3348e7eb2eea2472db611486846742d5d52d1290576de99d59edeb7cd4a42ca"}, - {file = "mypy-0.971-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3fa7a477b9900be9b7dd4bab30a12759e5abe9586574ceb944bc29cddf8f0417"}, - {file = "mypy-0.971-cp36-cp36m-win_amd64.whl", hash = "sha256:2ad53cf9c3adc43cf3bea0a7d01a2f2e86db9fe7596dfecb4496a5dda63cbb09"}, - {file = "mypy-0.971-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:855048b6feb6dfe09d3353466004490b1872887150c5bb5caad7838b57328cc8"}, - {file = "mypy-0.971-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:23488a14a83bca6e54402c2e6435467a4138785df93ec85aeff64c6170077fb0"}, - {file = "mypy-0.971-cp37-cp37m-win_amd64.whl", hash = "sha256:4b21e5b1a70dfb972490035128f305c39bc4bc253f34e96a4adf9127cf943eb2"}, - {file = "mypy-0.971-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9796a2ba7b4b538649caa5cecd398d873f4022ed2333ffde58eaf604c4d2cb27"}, - {file = "mypy-0.971-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a361d92635ad4ada1b1b2d3630fc2f53f2127d51cf2def9db83cba32e47c856"}, - {file = "mypy-0.971-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b793b899f7cf563b1e7044a5c97361196b938e92f0a4343a5d27966a53d2ec71"}, - {file = "mypy-0.971-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d1ea5d12c8e2d266b5fb8c7a5d2e9c0219fedfeb493b7ed60cd350322384ac27"}, - {file = "mypy-0.971-cp38-cp38-win_amd64.whl", hash = "sha256:23c7ff43fff4b0df93a186581885c8512bc50fc4d4910e0f838e35d6bb6b5e58"}, - {file = "mypy-0.971-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1f7656b69974a6933e987ee8ffb951d836272d6c0f81d727f1d0e2696074d9e6"}, - {file = "mypy-0.971-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:d2022bfadb7a5c2ef410d6a7c9763188afdb7f3533f22a0a32be10d571ee4bbe"}, - {file = "mypy-0.971-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef943c72a786b0f8d90fd76e9b39ce81fb7171172daf84bf43eaf937e9f220a9"}, - {file = "mypy-0.971-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d744f72eb39f69312bc6c2abf8ff6656973120e2eb3f3ec4f758ed47e414a4bf"}, - {file = "mypy-0.971-cp39-cp39-win_amd64.whl", hash = "sha256:77a514ea15d3007d33a9e2157b0ba9c267496acf12a7f2b9b9f8446337aac5b0"}, - {file = "mypy-0.971-py3-none-any.whl", hash = "sha256:0d054ef16b071149917085f51f89555a576e2618d5d9dd70bd6eea6410af3ac9"}, - {file = "mypy-0.971.tar.gz", hash = "sha256:40b0f21484238269ae6a57200c807d80debc6459d444c0489a102d7c6a75fa56"}, + {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, + {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, + {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"}, + {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"}, + {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"}, + {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"}, + {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"}, + {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"}, + {file = 
"mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"}, + {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"}, + {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"}, + {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"}, + {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"}, + {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"}, + {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"}, + {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"}, + {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"}, + {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"}, + {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"}, + {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"}, + {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"}, + {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = 
"sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"}, + {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"}, + {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"}, + {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"}, + {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"}, + {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"}, + {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"}, + {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"}, + {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"}, ] mypy-boto3-s3 = [ {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"}, @@ -2157,6 +2176,10 @@ pytest-asyncio = [ {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, ] +pytest-httpserver = [ + {file = "pytest_httpserver-1.0.6-py3-none-any.whl", hash = "sha256:ac2379acc91fe8bdbe2911c93af8dd130e33b5899fb9934d15669480739c6d32"}, + {file = "pytest_httpserver-1.0.6.tar.gz", hash = "sha256:9040d07bf59ac45d8de3db1d4468fd2d1d607975e4da4c872ecc0402cdbf7b3e"}, +] pytest-lazy-fixture = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = 
"sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, @@ -2260,8 +2283,8 @@ sarif-om = [ {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] setuptools = [ - {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, - {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, + {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, + {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, ] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 14a5450d5e..cbc067093e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -2,6 +2,7 @@ name = "proxy" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] anyhow = "1.0" @@ -16,12 +17,14 @@ hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" +hyper-tungstenite = "0.8.1" itertools = "0.10.3" md5 = "0.7.0" once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" +regex = "1.4.5" reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] } routerify = "3" rustls = "0.20.0" @@ -33,12 +36,14 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", 
rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } tokio-rustls = "0.23.0" +tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] } tracing = "0.1.36" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2.2" uuid = { version = "1.2", features = ["v4", "serde"] } +webpki-roots = "0.22.5" x509-parser = "0.14" metrics = { path = "../libs/metrics" } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 4b937f017a..e6a179a040 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,7 +8,9 @@ pub use console::{GetAuthInfoError, WakeComputeError}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, - compute, http, mgmt, stream, url, + compute, + console::messages::MetricsAuxInfo, + http, mgmt, stream, url, waiters::{self, Waiter, Waiters}, }; use once_cell::sync::Lazy; @@ -126,25 +128,13 @@ pub struct AuthSuccess { pub value: T, } -impl AuthSuccess { - /// Very similar to [`std::option::Option::map`]. - /// Maps [`AuthSuccess`] to [`AuthSuccess`] by applying - /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> AuthSuccess { - AuthSuccess { - reported_auth_ok: self.reported_auth_ok, - value: f(self.value), - } - } -} - /// Info for establishing a connection to a compute node. /// This is what we get after auth succeeded, but not before! pub struct NodeInfo { - /// Project from [`auth::ClientCredentials`]. - pub project: String, /// Compute node connection params. pub config: compute::ConnCfg, + /// Labels for proxy's metrics. + pub aux: MetricsAuxInfo, } impl BackendType<'_, ClientCredentials<'_>> { @@ -159,7 +149,7 @@ impl BackendType<'_, ClientCredentials<'_>> { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the project name. // We now expect to see a very specific payload in the place of password. 
- let fetch_magic_payload = async { + let fetch_magic_payload = |client| async { warn!("project name not specified, resorting to the password hack auth flow"); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) @@ -171,38 +161,61 @@ impl BackendType<'_, ClientCredentials<'_>> { auth::Result::Ok(payload) }; + // If we want to use cleartext password flow, we can read the password + // from the client and pretend that it's a magic payload (PasswordHack hack). + let fetch_plaintext_password = |client| async { + info!("using cleartext password flow"); + let payload = AuthFlow::new(client) + .begin(auth::CleartextPassword) + .await? + .authenticate() + .await?; + + auth::Result::Ok(auth::password_hack::PasswordHackPayload { + project: String::new(), + password: payload, + }) + }; + // TODO: find a proper way to merge those very similar blocks. - let (mut config, payload) = match self { + let (mut node, payload) = match self { Console(endpoint, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload.await?; + let payload = fetch_magic_payload(client).await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); - let config = console::Api::new(endpoint, extra, &creds) + let node = console::Api::new(endpoint, extra, &creds) .wake_compute() .await?; - (config, payload) + (node, payload) + } + Console(endpoint, creds) if creds.use_cleartext_password_flow => { + // This is a hack to allow cleartext password in secure connections (wss). 
+ let payload = fetch_plaintext_password(client).await?; + let creds = creds.as_ref(); + let node = console::Api::new(endpoint, extra, &creds) + .wake_compute() + .await?; + + (node, payload) } Postgres(endpoint, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload.await?; + let payload = fetch_magic_payload(client).await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); - let config = postgres::Api::new(endpoint, &creds).wake_compute().await?; + let node = postgres::Api::new(endpoint, &creds).wake_compute().await?; - (config, payload) + (node, payload) } _ => return Ok(None), }; - config.password(payload.password); + node.config.password(payload.password); Ok(Some(AuthSuccess { reported_auth_ok: false, - value: NodeInfo { - project: payload.project, - config, - }, + value: node, })) } @@ -233,10 +246,6 @@ impl BackendType<'_, ClientCredentials<'_>> { console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await? - .map(|config| NodeInfo { - project: creds.project.unwrap().into_owned(), - config, - }) } Postgres(endpoint, creds) => { info!("performing mock authentication using a local postgres instance"); @@ -245,10 +254,6 @@ impl BackendType<'_, ClientCredentials<'_>> { postgres::Api::new(&endpoint, &creds) .handle_user(client) .await? - .map(|config| NodeInfo { - project: creds.project.unwrap().into_owned(), - config, - }) } // NOTE: this auth backend doesn't use client credentials. Link(url) => { diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 040870fc8e..b3e3fd0c10 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,16 +1,16 @@ //! Cloud API V2. 
-use super::{AuthSuccess, ConsoleReqExtra}; +use super::{AuthSuccess, ConsoleReqExtra, NodeInfo}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, + console::messages::{ConsoleError, GetRoleSecret, WakeCompute}, error::{io_error, UserFacingError}, http, sasl, scram, stream::PqStream, }; use futures::TryFutureExt; use reqwest::StatusCode as HttpStatusCode; -use serde::Deserialize; use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; @@ -136,24 +136,6 @@ impl UserFacingError for WakeComputeError { } } -/// Console's response which holds client's auth secret. -#[derive(Deserialize, Debug)] -struct GetRoleSecret { - role_secret: Box, -} - -/// Console's response which holds compute node's `host:port` pair. -#[derive(Deserialize, Debug)] -struct WakeCompute { - address: Box, -} - -/// Console's error response with human-readable description. -#[derive(Deserialize, Debug)] -struct ConsoleError { - error: Box, -} - /// Auth secret which is managed by the cloud. pub enum AuthInfo { /// Md5 hash of user's password. @@ -194,7 +176,7 @@ impl<'a> Api<'a> { pub(super) async fn handle_user( &'a self, client: &mut PqStream, - ) -> auth::Result> { + ) -> auth::Result> { handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } } @@ -238,7 +220,7 @@ impl Api<'_> { } /// Wake up the compute node and return the corresponding connection info. 
- pub async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); async { let request = self @@ -269,7 +251,10 @@ impl Api<'_> { .dbname(self.creds.dbname) .user(self.creds.user); - Ok(config) + Ok(NodeInfo { + config, + aux: body.aux, + }) } .map_err(crate::error::log_error) .instrument(info_span!("wake_compute", id = request_id)) @@ -284,11 +269,11 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( endpoint: &'a Endpoint, get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo, wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, -) -> auth::Result> +) -> auth::Result> where Endpoint: AsRef>, GetAuthInfo: Future, GetAuthInfoError>>, - WakeCompute: Future>, + WakeCompute: Future>, { let creds = endpoint.as_ref(); @@ -325,19 +310,20 @@ where } }; - let mut config = wake_compute(endpoint).await?; + let mut node = wake_compute(endpoint).await?; if let Some(keys) = scram_keys { - config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys)); + use tokio_postgres::config::AuthKeys; + node.config.auth_keys(AuthKeys::ScramSha256(keys)); } Ok(AuthSuccess { reported_auth_ok: false, - value: config, + value: node, }) } /// Parse http response body, taking status code into account. 
-async fn parse_body Deserialize<'a>>( +async fn parse_body serde::Deserialize<'a>>( response: reqwest::Response, ) -> Result { let status = response.status(); diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 440a55f194..e16bbc70e4 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,6 +1,6 @@ use super::{AuthSuccess, NodeInfo}; use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; -use pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; @@ -60,7 +60,7 @@ pub async fn handle_user( info!(parent: &span, "sending the auth URL to the user"); client .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&Be::CLIENT_ENCODING)? .write_message(&Be::NoticeResponse(&greeting)) .await?; @@ -86,8 +86,8 @@ pub async fn handle_user( Ok(AuthSuccess { reported_auth_ok: true, value: NodeInfo { - project: db_info.project, config, + aux: db_info.aux, }, }) } diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 8f16dc9fa8..260342f103 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -2,7 +2,7 @@ use super::{ console::{self, AuthInfo, GetAuthInfoError, WakeComputeError}, - AuthSuccess, + AuthSuccess, NodeInfo, }; use crate::{ auth::{self, ClientCredentials}, @@ -57,7 +57,7 @@ impl<'a> Api<'a> { pub(super) async fn handle_user( &'a self, client: &mut PqStream, - ) -> auth::Result> { + ) -> auth::Result> { // We reuse user handling logic from a production module. console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } @@ -103,7 +103,7 @@ impl Api<'_> { } /// We don't need to wake anything locally, so we just return the connection info. 
- pub async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) @@ -111,7 +111,10 @@ impl Api<'_> { .dbname(self.creds.dbname) .user(self.creds.user); - Ok(config) + Ok(NodeInfo { + config, + aux: Default::default(), + }) } } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 0a3b84bb52..3b71bef9aa 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -34,6 +34,9 @@ pub struct ClientCredentials<'a> { pub user: &'a str, pub dbname: &'a str, pub project: Option>, + /// If `True`, we'll use the old cleartext password flow. This is used for + /// websocket connections, which want to minimize the number of round trips. + pub use_cleartext_password_flow: bool, } impl ClientCredentials<'_> { @@ -50,6 +53,7 @@ impl<'a> ClientCredentials<'a> { user: self.user, dbname: self.dbname, project: self.project().map(Cow::Borrowed), + use_cleartext_password_flow: self.use_cleartext_password_flow, } } } @@ -59,6 +63,7 @@ impl<'a> ClientCredentials<'a> { params: &'a StartupMessageParams, sni: Option<&str>, common_name: Option<&str>, + use_cleartext_password_flow: bool, ) -> Result { use ClientCredsParseError::*; @@ -108,6 +113,7 @@ impl<'a> ClientCredentials<'a> { user = user, dbname = dbname, project = project.as_deref(), + use_cleartext_password_flow = use_cleartext_password_flow, "credentials" ); @@ -115,6 +121,7 @@ impl<'a> ClientCredentials<'a> { user, dbname, project, + use_cleartext_password_flow, }) } } @@ -141,7 +148,7 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe")]); // TODO: check that `creds.dbname` is None. 
- let creds = ClientCredentials::parse(&options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); Ok(()) @@ -151,7 +158,7 @@ mod tests { fn parse_missing_project() -> anyhow::Result<()> { let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); - let creds = ClientCredentials::parse(&options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project, None); @@ -166,7 +173,7 @@ mod tests { let sni = Some("foo.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -182,7 +189,7 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -201,7 +208,7 @@ mod tests { let sni = Some("baz.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name, false)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -220,7 +227,8 @@ mod tests { let sni = Some("second.localhost"); let common_name = Some("localhost"); - let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = + ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); match err { InconsistentProjectNames { 
domain, option } => { assert_eq!(option, "first"); @@ -237,7 +245,8 @@ mod tests { let sni = Some("project.localhost"); let common_name = Some("example.com"); - let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = + ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); match err { InconsistentSni { sni, cn } => { assert_eq!(sni, "project.localhost"); diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index d9ee50894d..4b982c0c5e 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -37,6 +37,17 @@ impl AuthMethod for PasswordHack { } } +/// Use clear-text password auth called `password` in docs +/// +pub struct CleartextPassword; + +impl AuthMethod for CleartextPassword { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationCleartextPassword + } +} + /// This wrapper for [`PqStream`] performs client authentication. #[must_use] pub struct AuthFlow<'a, Stream, State> { @@ -86,6 +97,18 @@ impl AuthFlow<'_, S, PasswordHack> { } } +impl AuthFlow<'_, S, CleartextPassword> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> super::Result> { + let msg = self.stream.read_password_message().await?; + let password = msg + .strip_suffix(&[0]) + .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + + Ok(password.to_vec()) + } +} + /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4c5edb9673..094db73061 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,18 +8,17 @@ use tokio::net::TcpStream; use tokio_postgres::NoTls; use tracing::{error, info}; +const COULD_NOT_CONNECT: &str = "Could not connect to compute node"; + #[derive(Debug, Error)] pub enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. - #[error("Failed to connect to the compute node: {0}")] + #[error("{COULD_NOT_CONNECT}: {0}")] Postgres(#[from] tokio_postgres::Error), - #[error("Failed to connect to the compute node")] - FailedToConnectToCompute, - - #[error("Failed to fetch compute node version")] - FailedToFetchPgVersion, + #[error("{COULD_NOT_CONNECT}: {0}")] + CouldNotConnect(#[from] io::Error), } impl UserFacingError for ConnectionError { @@ -29,10 +28,10 @@ impl UserFacingError for ConnectionError { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. Postgres(err) => match err.as_db_error() { - Some(err) => err.message().to_string(), + Some(err) => err.message().to_owned(), None => err.to_string(), }, - other => other.to_string(), + _ => COULD_NOT_CONNECT.to_owned(), } } } @@ -44,12 +43,12 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[repr(transparent)] -pub struct ConnCfg(pub tokio_postgres::Config); +pub struct ConnCfg(Box); impl ConnCfg { /// Construct a new connection config. 
pub fn new() -> Self { - Self(tokio_postgres::Config::new()) + Self(Default::default()) } } @@ -95,7 +94,7 @@ impl ConnCfg { io::ErrorKind::Other, format!( "couldn't connect: bad compute config, \ - ports and hosts entries' count does not match: {:?}", + ports and hosts entries' count does not match: {:?}", self.0 ), )); @@ -131,8 +130,8 @@ impl ConnCfg { pub struct PostgresConnection { /// Socket connected to a compute node. pub stream: TcpStream, - /// PostgreSQL version of this instance. - pub version: String, + /// PostgreSQL connection parameters. + pub params: std::collections::HashMap, } impl ConnCfg { @@ -156,6 +155,7 @@ impl ConnCfg { self.0.application_name(app_name); } + // TODO: This is especially ugly... if let Some(replication) = params.get("replication") { use tokio_postgres::config::ReplicationMode; match replication { @@ -172,22 +172,24 @@ impl ConnCfg { // TODO: extend the list of the forwarded startup parameters. // Currently, tokio-postgres doesn't allow us to pass // arbitrary parameters, but the ones above are a good start. + // + // This and the reverse params problem can be better addressed + // in a bespoke connection machinery (a new library for that sake). - let (socket_addr, mut stream) = self - .connect_raw() - .await - .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - - // TODO: establish a secure connection to the DB - let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?; - let version = conn - .parameter("server_version") - .ok_or(ConnectionError::FailedToFetchPgVersion)? - .into(); - + // TODO: establish a secure connection to the DB. + let (socket_addr, mut stream) = self.connect_raw().await?; + let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?; info!("connected to user's compute node at {socket_addr}"); + + // This is very ugly but as of now there's no better way to + // extract the connection parameters from tokio-postgres' connection. 
+ // TODO: solve this problem in a more elegant manner (e.g. the new library). + let params = connection.parameters; + + // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. + // Yet another reason to rework the connection establishing code. let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); - let db = PostgresConnection { stream, version }; + let db = PostgresConnection { stream, params }; Ok((db, cancel_closure)) } diff --git a/proxy/src/console.rs b/proxy/src/console.rs new file mode 100644 index 0000000000..78f09ac9e1 --- /dev/null +++ b/proxy/src/console.rs @@ -0,0 +1,5 @@ +///! Various stuff for dealing with the Neon Console. +///! Later we might move some API wrappers here. + +/// Payloads used in the console's APIs. +pub mod messages; diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs new file mode 100644 index 0000000000..63a97069b8 --- /dev/null +++ b/proxy/src/console/messages.rs @@ -0,0 +1,190 @@ +use serde::Deserialize; +use std::fmt; + +/// Generic error response with human-readable description. +/// Note that we can't always present it to user as is. +#[derive(Debug, Deserialize)] +pub struct ConsoleError { + pub error: Box, +} + +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/proxy_get_role_secret` API method. +#[derive(Deserialize)] +pub struct GetRoleSecret { + pub role_secret: Box, +} + +// Manually implement debug to omit sensitive info. +impl fmt::Debug for GetRoleSecret { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("GetRoleSecret").finish_non_exhaustive() + } +} + +/// Response which holds compute node's `host:port` pair. +/// Returned by the `/proxy_wake_compute` API method. +#[derive(Debug, Deserialize)] +pub struct WakeCompute { + pub address: Box, + pub aux: MetricsAuxInfo, +} + +/// Async response which concludes the link auth flow. 
+/// Also known as `kickResponse` in the console. +#[derive(Debug, Deserialize)] +pub struct KickSession<'a> { + /// Session ID is assigned by the proxy. + pub session_id: &'a str, + + /// Compute node connection params. + #[serde(deserialize_with = "KickSession::parse_db_info")] + pub result: DatabaseInfo, +} + +impl KickSession<'_> { + fn parse_db_info<'de, D>(des: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(Deserialize)] + enum Wrapper { + // Currently, console only reports `Success`. + // `Failure(String)` used to be here... RIP. + Success(DatabaseInfo), + } + + Wrapper::deserialize(des).map(|x| match x { + Wrapper::Success(info) => info, + }) + } +} + +/// Compute node connection params. +#[derive(Deserialize)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + /// Console always provides a password, but it might + /// be inconvenient for debug with local PG instance. + pub password: Option, + pub aux: MetricsAuxInfo, +} + +// Manually implement debug to omit sensitive info. +impl fmt::Debug for DatabaseInfo { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .field("dbname", &self.dbname) + .field("user", &self.user) + .finish_non_exhaustive() + } +} + +/// Various labels for prometheus metrics. +/// Also known as `ProxyMetricsAuxInfo` in the console. +#[derive(Debug, Deserialize, Default)] +pub struct MetricsAuxInfo { + pub endpoint_id: Box, + pub project_id: Box, + pub branch_id: Box, +} + +impl MetricsAuxInfo { + /// Definitions of labels for traffic metric. + pub const TRAFFIC_LABELS: &'static [&'static str] = &[ + // Received (rx) / sent (tx). + "direction", + // ID of a project. + "project_id", + // ID of an endpoint within a project. + "endpoint_id", + // ID of a branch within a project (snapshot). + "branch_id", + ]; + + /// Values of labels for traffic metric. 
+ // TODO: add more type safety (validate arity & positions). + pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] { + [ + direction, + &self.project_id, + &self.endpoint_id, + &self.branch_id, + ] + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn dummy_aux() -> serde_json::Value { + json!({ + "endpoint_id": "endpoint", + "project_id": "project", + "branch_id": "branch", + }) + } + + #[test] + fn parse_kick_session() -> anyhow::Result<()> { + // This is what the console's kickResponse looks like. + let json = json!({ + "session_id": "deadbeef", + "result": { + "Success": { + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + "aux": dummy_aux(), + } + } + }); + let _: KickSession = serde_json::from_str(&json.to_string())?; + + Ok(()) + } + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + // with password + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + "aux": dummy_aux(), + }))?; + + // without password + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "aux": dummy_aux(), + }))?; + + // new field (forward compatibility) + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "project": "hello_world", + "N.E.W": "forward compatibility check", + "aux": dummy_aux(), + }))?; + + Ok(()) + } +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 096a33d73d..e847edc8bd 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,4 +1,5 @@ pub mod server; +pub mod websocket; use crate::url::ApiUrl; diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs new file mode 100644 index 0000000000..33c2752307 --- /dev/null +++ 
b/proxy/src/http/websocket.rs @@ -0,0 +1,263 @@ +use bytes::{Buf, Bytes}; +use futures::{Sink, Stream, StreamExt}; +use hyper::server::accept::{self}; +use hyper::server::conn::AddrIncoming; +use hyper::upgrade::Upgraded; +use hyper::{Body, Request, Response, StatusCode}; +use hyper_tungstenite::{tungstenite, WebSocketStream}; +use hyper_tungstenite::{tungstenite::Message, HyperWebsocket}; +use pin_project_lite::pin_project; +use tokio::net::TcpListener; + +use std::convert::Infallible; +use std::future::ready; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tls_listener::TlsListener; + +use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; + +use tracing::{error, info, info_span, warn, Instrument}; +use utils::http::{error::ApiError, json::json_response}; + +use crate::cancellation::CancelMap; +use crate::config::ProxyConfig; +use crate::proxy::handle_ws_client; + +pin_project! { + /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite. + pub struct WebSocketRW { + #[pin] + stream: WebSocketStream, + chunk: Option, + } +} + +// FIXME: explain why this is safe or try to remove `unsafe impl`. 
+unsafe impl Sync for WebSocketRW {} + +impl WebSocketRW { + pub fn new(stream: WebSocketStream) -> Self { + Self { + stream, + chunk: None, + } + } + + fn has_chunk(&self) -> bool { + if let Some(ref chunk) = self.chunk { + chunk.remaining() > 0 + } else { + false + } + } +} + +fn ws_err_into(e: tungstenite::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e.to_string()) +} + +impl AsyncWrite for WebSocketRW { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let mut this = self.project(); + match this.stream.as_mut().poll_ready(cx) { + Poll::Ready(Ok(())) => { + if let Err(e) = this + .stream + .as_mut() + .start_send(Message::Binary(buf.to_vec())) + { + Poll::Ready(Err(ws_err_into(e))) + } else { + Poll::Ready(Ok(buf.len())) + } + } + Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))), + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } + + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().stream.poll_flush(cx).map_err(ws_err_into) + } + + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().stream.poll_close(cx).map_err(ws_err_into) + } +} + +impl AsyncRead for WebSocketRW { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + if buf.remaining() == 0 { + return Poll::Ready(Ok(())); + } + + let inner_buf = match self.as_mut().poll_fill_buf(cx) { + Poll::Ready(Ok(buf)) => buf, + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => return Poll::Pending, + }; + let len = std::cmp::min(inner_buf.len(), buf.remaining()); + buf.put_slice(&inner_buf[..len]); + + self.consume(len); + Poll::Ready(Ok(())) + } +} + +impl AsyncBufRead for WebSocketRW { + fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + if self.as_mut().has_chunk() { + let buf = self.project().chunk.as_ref().unwrap().chunk(); + return 
Poll::Ready(Ok(buf)); + } else { + match self.as_mut().project().stream.poll_next(cx) { + Poll::Ready(Some(Ok(message))) => match message { + Message::Text(_) => {} + Message::Binary(chunk) => { + *self.as_mut().project().chunk = Some(Bytes::from(chunk)); + } + Message::Ping(_) => { + // No need to send a reply: tungstenite takes care of this for you. + } + Message::Pong(_) => {} + Message::Close(_) => { + // No need to send a reply: tungstenite takes care of this for you. + return Poll::Ready(Ok(&[])); + } + Message::Frame(_) => { + unreachable!(); + } + }, + Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))), + Poll::Ready(None) => return Poll::Ready(Ok(&[])), + Poll::Pending => return Poll::Pending, + } + } + } + } + + fn consume(self: Pin<&mut Self>, amt: usize) { + if amt > 0 { + self.project() + .chunk + .as_mut() + .expect("No chunk present") + .advance(amt); + } + } +} + +async fn serve_websocket( + websocket: HyperWebsocket, + config: &ProxyConfig, + cancel_map: &CancelMap, + session_id: uuid::Uuid, + hostname: Option, +) -> anyhow::Result<()> { + let websocket = websocket.await?; + handle_ws_client( + config, + cancel_map, + session_id, + WebSocketRW::new(websocket), + hostname, + ) + .await?; + Ok(()) +} + +async fn ws_handler( + mut request: Request, + config: &'static ProxyConfig, + cancel_map: Arc, + session_id: uuid::Uuid, +) -> Result, ApiError> { + let host = request + .headers() + .get("host") + .and_then(|h| h.to_str().ok()) + .and_then(|h| h.split(':').next()) + .map(|s| s.to_string()); + + // Check if the request is a websocket upgrade request. 
+ if hyper_tungstenite::is_upgrade_request(&request) { + let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) + .map_err(|e| ApiError::BadRequest(e.into()))?; + + tokio::spawn(async move { + if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await + { + error!("error in websocket connection: {:?}", e); + } + }); + + // Return the response so the spawned future can continue. + Ok(response) + } else { + json_response(StatusCode::OK, "Connect with a websocket client") + } +} + +pub async fn task_main( + ws_listener: TcpListener, + config: &'static ProxyConfig, +) -> anyhow::Result<()> { + scopeguard::defer! { + info!("websocket server has shut down"); + } + + let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config()); + let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config { + Some(config) => config.into(), + None => { + warn!("TLS config is missing, WebSocket Secure server will not be started"); + return Ok(()); + } + }; + + let addr_incoming = AddrIncoming::from_listener(ws_listener)?; + + let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { + if let Err(err) = conn { + error!("failed to accept TLS connection for websockets: {:?}", err); + ready(false) + } else { + ready(true) + } + }); + + let make_svc = hyper::service::make_service_fn(|_stream| async move { + Ok::<_, Infallible>(hyper::service::service_fn( + move |req: Request| async move { + let cancel_map = Arc::new(CancelMap::default()); + let session_id = uuid::Uuid::new_v4(); + ws_handler(req, config, cancel_map, session_id) + .instrument(info_span!( + "ws-client", + session = format_args!("{session_id}") + )) + .await + }, + )) + }); + + hyper::Server::builder(accept::from_stream(tls_listener)) + .serve(make_svc) + .await?; + + Ok(()) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2855d1f900..aa6766c102 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -8,6 +8,7 @@ mod auth; mod 
cancellation; mod compute; mod config; +mod console; mod error; mod http; mod mgmt; @@ -109,12 +110,23 @@ async fn main() -> anyhow::Result<()> { info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; - let tasks = [ + let mut tasks = vec![ tokio::spawn(http::server::task_main(http_listener)), tokio::spawn(proxy::task_main(config, proxy_listener)), tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), - ] - .map(flatten_err); + ]; + + if let Some(wss_address) = arg_matches.get_one::("wss") { + let wss_address: SocketAddr = wss_address.parse()?; + info!("Starting wss on {}", wss_address); + let wss_listener = TcpListener::bind(wss_address).await?; + tasks.push(tokio::spawn(http::websocket::task_main( + wss_listener, + config, + ))); + } + + let tasks = tasks.into_iter().map(flatten_err); set_build_info_metric(GIT_VERSION); // This will block until all tasks have completed. @@ -154,6 +166,11 @@ fn cli() -> clap::Command { .help("listen for incoming http connections (metrics, etc) on ip:port") .default_value("127.0.0.1:7001"), ) + .arg( + Arg::new("wss") + .long("wss") + .help("listen for incoming wss connections on ip:port"), + ) .arg( Arg::new("uri") .short('u') diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 23e10b5a9b..cf83b48ae0 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,13 +1,18 @@ -use crate::auth; +use crate::{ + auth, + console::messages::{DatabaseInfo, KickSession}, +}; use anyhow::Context; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use serde::Deserialize; use std::{ net::{TcpListener, TcpStream}, thread, }; use tracing::{error, info, info_span}; -use utils::postgres_backend::{self, AuthType, PostgresBackend}; +use utils::{ + postgres_backend::{self, AuthType, PostgresBackend}, + postgres_backend_async::QueryError, +}; /// Console management API listener thread. /// It spawns console response handlers needed for the link auth. 
@@ -45,68 +50,18 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { } } -fn handle_connection(socket: TcpStream) -> anyhow::Result<()> { +fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?; pgbackend.run(&mut MgmtHandler) } -/// Known as `kickResponse` in the console. -#[derive(Debug, Deserialize)] -struct PsqlSessionResponse { - session_id: String, - result: PsqlSessionResult, -} - -#[derive(Debug, Deserialize)] -enum PsqlSessionResult { - Success(DatabaseInfo), - Failure(String), -} - /// A message received by `mgmt` when a compute node is ready. pub type ComputeReady = Result; -impl PsqlSessionResult { - fn into_compute_ready(self) -> ComputeReady { - match self { - Self::Success(db_info) => Ok(db_info), - Self::Failure(message) => Err(message), - } - } -} - -/// Compute node connection params provided by the console. -/// This struct and its parents are mgmt API implementation -/// detail and thus should remain in this module. -// TODO: restore deserialization tests from git history. -#[derive(Deserialize)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - /// Console always provides a password, but it might - /// be inconvenient for debug with local PG instance. - pub password: Option, - pub project: String, -} - -// Manually implement debug to omit sensitive info. -impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .field("dbname", &self.dbname) - .field("user", &self.user) - .finish_non_exhaustive() - } -} - // TODO: replace with an http-based protocol. 
struct MgmtHandler; impl postgres_backend::Handler for MgmtHandler { - fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> { + fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { try_process_query(pgb, query).map_err(|e| { error!("failed to process response: {e:?}"); e @@ -114,14 +69,14 @@ impl postgres_backend::Handler for MgmtHandler { } } -fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> { - let resp: PsqlSessionResponse = serde_json::from_str(query)?; +fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { + let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); let _enter = span.enter(); info!("got response: {:?}", resp.result); - match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth::backend::notify(resp.session_id, Ok(resp.result)) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? 
@@ -129,49 +84,9 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<( } Err(e) => { error!("failed to deliver response to per-client task"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?; } } Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - // with password - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - "project": "hello_world", - }))?; - - // without password - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "project": "hello_world", - }))?; - - // new field (forward compatibility) - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "project": "hello_world", - "N.E.W": "forward compatibility check", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index da3cb144e3..63573d49c0 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context}; use futures::TryFutureExt; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; -use pq_proto::{BeMessage as Be, *}; +use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, info_span, Instrument}; @@ -39,12 +39,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "proxy_io_bytes_per_client", "Number of bytes sent/received between client and backend.", - &[ - // Received (rx) / sent (tx). - "direction", - // Proxy can keep calling it `project` internally. 
- "endpoint_id" - ] + crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, ) .unwrap() }); @@ -87,6 +82,47 @@ pub async fn task_main( } } +pub async fn handle_ws_client( + config: &ProxyConfig, + cancel_map: &CancelMap, + session_id: uuid::Uuid, + stream: impl AsyncRead + AsyncWrite + Unpin + Send, + hostname: Option<String>, +) -> anyhow::Result<()> { + // The `closed` counter will increase when this future is destroyed. + NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); + scopeguard::defer! { + NUM_CONNECTIONS_CLOSED_COUNTER.inc(); + } + + let tls = config.tls_config.as_ref(); + let hostname = hostname.as_deref(); + + // TLS is None here, because the connection is already encrypted. + let do_handshake = handshake(stream, None, cancel_map).instrument(info_span!("handshake")); + let (mut stream, params) = match do_handshake.await? { + Some(x) => x, + None => return Ok(()), // it's a cancellation request + }; + + // Extract credentials which we're going to use for auth. + let creds = { + let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let result = config + .auth_backend + .as_ref() + .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true)) + .transpose(); + + async { result }.or_else(|e| stream.throw_error(e)).await? + }; + + let client = Client::new(stream, creds, &params, session_id); + cancel_map + .with_session(|session| client.connect_to_db(session)) + .await +} + async fn handle_client( config: &ProxyConfig, cancel_map: &CancelMap, @@ -113,7 +149,7 @@ async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(&params, sni, common_name)) + .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -255,29 +291,32 @@ impl Client<'_, S> { // Note that we do this only (for the most part) after we've connected // to a compute (see above) which performs its own authentication. 
if !auth_result.reported_auth_ok { - stream - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; + stream.write_message_noflush(&Be::AuthenticationOk)?; + } + + // Forward all postgres connection params to the client. + // Right now the implementation is very hacky and inefficient (ideally, + // we don't need an intermediate hashmap), but at least it should be correct. + for (name, value) in &db.params { + // TODO: Theoretically, this could result in a big pile of params... + stream.write_message_noflush(&Be::ParameterStatus { + name: name.as_bytes(), + value: value.as_bytes(), + })?; } stream - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&db.version), - ))? .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? - .write_message(&BeMessage::ReadyForQuery) + .write_message(&Be::ReadyForQuery) .await?; - // TODO: add more identifiers. - let metric_id = node.project; - - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]); + let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("tx")); let mut client = MeasuredStream::new(stream.into_inner(), |cnt| { // Number of bytes we sent to the client (outbound). m_sent.inc_by(cnt as u64); }); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]); + let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("rx")); let mut db = MeasuredStream::new(db.stream, |cnt| { // Number of bytes the client sent to the compute node (inbound). m_recv.inc_by(cnt as u64); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 24fbc57b99..ed429df421 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -139,8 +139,8 @@ async fn dummy_proxy( stream .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? 
- .write_message(&BeMessage::ReadyForQuery) + .write_message_noflush(&Be::CLIENT_ENCODING)? + .write_message(&Be::ReadyForQuery) .await?; Ok(()) diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 89668465fa..424beccec9 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -48,7 +48,7 @@ impl ServerSecret { Self { iterations: 4096, - salt_base64: base64::encode(&mocked_salt), + salt_base64: base64::encode(mocked_salt), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, @@ -68,7 +68,7 @@ impl ServerSecret { Some(Self { iterations, - salt_base64: base64::encode(&salt), + salt_base64: base64::encode(salt), stored_key: password.client_key().sha256(), server_key: password.server_key(), doomed: false, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 19e1479068..02a0fabe9a 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -2,7 +2,7 @@ use crate::error::UserFacingError; use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; +use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket}; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; @@ -47,18 +47,13 @@ fn err_connection() -> io::Error { io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost") } -// TODO: change error type of `FeMessage::read_fut` -fn from_anyhow(e: anyhow::Error) -> io::Error { - io::Error::new(io::ErrorKind::Other, e.to_string()) -} - impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. pub async fn read_startup_packet(&mut self) -> io::Result { // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` let msg = FeStartupPacket::read_fut(&mut self.stream) .await - .map_err(from_anyhow)? + .map_err(ConnectionError::into_io_error)? 
.ok_or_else(err_connection)?; match msg { @@ -80,7 +75,7 @@ impl PqStream { async fn read_message(&mut self) -> io::Result { FeMessage::read_fut(&mut self.stream) .await - .map_err(from_anyhow)? + .map_err(ConnectionError::into_io_error)? .ok_or_else(err_connection) } } @@ -112,7 +107,8 @@ impl PqStream { /// This method exists due to `&str` not implementing `Into`. pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { tracing::info!("forwarding error to user: {error}"); - self.write_message(&BeMessage::ErrorResponse(error)).await?; + self.write_message(&BeMessage::ErrorResponse(error, None)) + .await?; bail!(error) } @@ -124,7 +120,8 @@ impl PqStream { { let msg = error.to_string_client(); tracing::info!("forwarding error to user: {msg}"); - self.write_message(&BeMessage::ErrorResponse(&msg)).await?; + self.write_message(&BeMessage::ErrorResponse(&msg, None)) + .await?; bail!(error) } } diff --git a/pyproject.toml b/pyproject.toml index b297f7f70b..b4fb7a9e7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,10 +32,11 @@ toml = "^0.10.2" psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" +pytest-httpserver = "^1.0.6" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" -mypy = "==0.971" +mypy = "==0.991" black = "^22.6.0" isort = "^5.10.1" @@ -60,10 +61,8 @@ skip = [ ] [tool.mypy] -# mypy uses regex exclude = "^vendor/" -# some tests don't typecheck when this flag is set -check_untyped_defs = false +check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" diff --git a/run_clippy.sh b/run_clippy.sh index bf770432d0..fe0e745d7d 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -9,8 +9,8 @@ # In vscode, this setting is Rust-analyzer>Check On Save:Command -# Not every feature is supported in macOS builds, e.g. 
`profiling`, -# avoid running regular linting script that checks every feature. +# Not every feature is supported in macOS builds. Avoid running regular linting +# script that checks every feature. if [[ "$OSTYPE" == "darwin"* ]]; then # no extra features to test currently, add more here when needed cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index d11ef1711a..d0c804fe4e 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -2,6 +2,7 @@ name = "safekeeper" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [dependencies] async-stream = "0.3" @@ -20,8 +21,8 @@ hyper = "0.14" nix = "0.25" once_cell = "1.13.0" parking_lot = "0.12.1" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1.4.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1" @@ -29,7 +30,7 @@ serde_with = "2.0" signal-hook = "0.3.10" thiserror = "1" tokio = { version = "1.17", features = ["macros", "fs"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.27" url = "2.2.2" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index cab5053b5b..b130ea86bd 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ 
b/safekeeper/src/bin/safekeeper.rs @@ -82,6 +82,9 @@ struct Args { /// established; plaintext otherwise. #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)] broker_endpoint: Uri, + /// Broker keepalive interval. + #[arg(long, value_parser= humantime::parse_duration, default_value = storage_broker::DEFAULT_KEEPALIVE_INTERVAL)] + broker_keepalive_interval: Duration, /// Peer safekeeper is considered dead after not receiving heartbeats from /// it during this period passed as a human readable duration. #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)] @@ -126,28 +129,47 @@ fn main() -> anyhow::Result<()> { logging::init(LogFormat::from_config(&args.log_format)?)?; info!("version: {GIT_VERSION}"); + let args_workdir = &args.datadir; + let workdir = args_workdir.canonicalize().with_context(|| { + format!("Failed to get the absolute path for input workdir {args_workdir:?}") + })?; + // Change into the data directory. - std::env::set_current_dir(&args.datadir)?; + std::env::set_current_dir(&workdir)?; // Set or read our ID. 
- let id = set_id(&args.datadir, args.id.map(NodeId))?; + let id = set_id(&workdir, args.id.map(NodeId))?; if args.init { return Ok(()); } + let auth = match args.auth_validation_public_key_path.as_ref() { + None => { + info!("auth is disabled"); + None + } + Some(path) => { + info!("loading JWT auth key from {}", path.display()); + Some(Arc::new( + JwtAuth::from_key_path(path).context("failed to load the auth key")?, + )) + } + }; + let conf = SafeKeeperConf { - workdir: args.datadir, + workdir, my_id: id, listen_pg_addr: args.listen_pg, listen_http_addr: args.listen_http, no_sync: args.no_sync, broker_endpoint: args.broker_endpoint, + broker_keepalive_interval: args.broker_keepalive_interval, heartbeat_timeout: args.heartbeat_timeout, remote_storage: args.remote_storage, max_offloader_lag_bytes: args.max_offloader_lag, backup_runtime_threads: args.wal_backup_threads, wal_backup_enabled: !args.disable_wal_backup, - auth_validation_public_key_path: args.auth_validation_public_key_path, + auth, }; // initialize sentry if SENTRY_DSN is provided @@ -177,19 +199,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { e })?; - let auth = match conf.auth_validation_public_key_path.as_ref() { - None => { - info!("auth is disabled"); - None - } - Some(path) => { - info!("loading JWT auth key from {}", path.display()); - Some(Arc::new( - JwtAuth::from_key_path(path).context("failed to load the auth key")?, - )) - } - }; - // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. 
let timeline_collector = safekeeper::metrics::TimelineCollector::new(); @@ -203,12 +212,11 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; let conf_ = conf.clone(); - let auth_ = auth.clone(); threads.push( thread::Builder::new() .name("http_endpoint_thread".into()) .spawn(|| { - let router = http::make_router(conf_, auth_); + let router = http::make_router(conf_); endpoint::serve_thread_main( router, http_listener, @@ -221,11 +229,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("safekeeper thread".into()) - .spawn(|| { - if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) { - info!("safekeeper thread terminated: {e}"); - } - }) + .spawn(|| wal_service::thread_main(conf_cloned, pg_listener)) .unwrap(); threads.push(safekeeper_thread); @@ -235,7 +239,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { thread::Builder::new() .name("broker thread".into()) .spawn(|| { - // TODO: add auth? broker::thread_main(conf_); })?, ); @@ -304,7 +307,8 @@ fn set_id(workdir: &Path, given_id: Option) -> Result { } else { bail!("safekeeper id is not specified"); }; - let mut f = File::create(&id_file_path)?; + let mut f = File::create(&id_file_path) + .with_context(|| format!("Failed to create id file at {id_file_path:?}"))?; f.write_all(my_id.to_string().as_bytes())?; f.sync_all()?; info!("initialized safekeeper id {}", my_id); diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index df2dc92efe..92f35bf51f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -66,7 +66,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { /// Subscribe and fetch all the interesting data from the broker. 
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = storage_broker::connect(conf.broker_endpoint)?; + let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all let request = SubscribeSafekeeperInfoRequest { diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index f4a0f8520c..ba5e453e41 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -239,7 +239,7 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, FileStorage::load_control_file_conf(conf, ttid)?, @@ -250,7 +250,7 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 05527303ca..60df5dd372 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -8,16 +8,16 @@ use crate::receive_wal::ReceiveWalConn; use crate::send_wal::ReplicationConn; use crate::{GlobalTimelines, SafeKeeperConf}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::Context; use postgres_ffi::PG_TLI; use regex::Regex; use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use std::str; -use std::sync::Arc; use tracing::info; -use utils::auth::{Claims, JwtAuth, Scope}; +use utils::auth::{Claims, Scope}; +use 
utils::postgres_backend_async::QueryError; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -32,7 +32,6 @@ pub struct SafekeeperPostgresHandler { pub tenant_id: Option, pub timeline_id: Option, pub ttid: TenantTimelineId, - auth: Option>, claims: Option, } @@ -44,7 +43,7 @@ enum SafekeeperPostgresCommand { JSONCtrl { cmd: AppendLogicalMessage }, } -fn parse_cmd(cmd: &str) -> Result { +fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { @@ -64,13 +63,17 @@ fn parse_cmd(cmd: &str) -> Result { cmd: serde_json::from_str(cmd)?, }) } else { - bail!("unsupported command {}", cmd); + anyhow::bail!("unsupported command {cmd}"); } } impl postgres_backend::Handler for SafekeeperPostgresHandler { // tenant_id and timeline_id are passed in connection string params - fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + sm: &FeStartupPacket, + ) -> Result<(), QueryError> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(options) = params.options_raw() { for opt in options { @@ -79,10 +82,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { Some(("ztenantid", value)) | Some(("tenant_id", value)) => { - self.tenant_id = Some(value.parse()?); + self.tenant_id = Some(value.parse().with_context(|| { + format!("Failed to parse {value} as tenant id") + })?); } Some(("ztimelineid", value)) | Some(("timeline_id", value)) => { - self.timeline_id = Some(value.parse()?); + self.timeline_id = Some(value.parse().with_context(|| { + format!("Failed to parse {value} as timeline id") + })?); } _ => continue, } @@ -95,7 +102,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { Ok(()) } else { - bail!("Safekeeper received unexpected initial message: {:?}", sm); + Err(QueryError::Other(anyhow::anyhow!( + "Safekeeper received unexpected initial message: {sm:?}" + ))) } } @@ -103,20 +112,20 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { &mut self, _pgb: &mut PostgresBackend, jwt_response: &[u8], - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self + .conf .auth .as_ref() .unwrap() - .decode(str::from_utf8(jwt_response)?)?; + .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?; - if matches!(data.claims.scope, Scope::Tenant) { - ensure!( - data.claims.tenant_id.is_some(), + if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { + return Err(QueryError::Other(anyhow::anyhow!( "jwt token scope is Tenant, but tenant id is missing" - ) + ))); } info!( @@ -128,7 +137,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { Ok(()) } - fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { 
+ fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query_string: &str, + ) -> Result<(), QueryError> { if query_string .to_ascii_lowercase() .starts_with("set datestyle to ") @@ -149,39 +162,45 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { self.check_permission(Some(tenant_id))?; self.ttid = TenantTimelineId::new(tenant_id, timeline_id); - match cmd { + let res = match cmd { SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), SafekeeperPostgresCommand::StartReplication { start_lsn } => { ReplicationConn::new(pgb).run(self, pgb, start_lsn) } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), - } - .context(format!( - "Failed to process query for timeline {timeline_id}" - ))?; + }; - Ok(()) + match res { + Ok(()) => Ok(()), + Err(QueryError::Disconnected(connection_error)) => { + info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}"); + Err(QueryError::Disconnected(connection_error)) + } + Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!( + "Failed to process query for timeline {}", + self.ttid + )))), + } } } impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf, auth: Option>) -> Self { + pub fn new(conf: SafeKeeperConf) -> Self { SafekeeperPostgresHandler { conf, appname: None, tenant_id: None, timeline_id: None, ttid: TenantTimelineId::empty(), - auth, claims: None, } } // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenant_id: Option) -> Result<()> { - if self.auth.is_none() { + fn check_permission(&self, tenant_id: Option) -> anyhow::Result<()> { + if self.conf.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); } @@ -198,7 +217,7 @@ impl SafekeeperPostgresHandler { /// /// 
Handle IDENTIFY_SYSTEM replication command /// - fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { + fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid)?; let lsn = if self.is_walproposer_recovery() { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a9a9eb3388..a917d61678 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -277,12 +277,9 @@ async fn record_safekeeper_info(mut request: Request) -> Result>, -) -> RouterBuilder { +pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); - if auth.is_some() { + if conf.auth.is_some() { router = router.middleware(auth_middleware(|request| { #[allow(clippy::mutable_key_type)] static ALLOWLIST_ROUTES: Lazy> = @@ -298,6 +295,7 @@ pub fn make_router( // NB: on any changes do not forget to update the OpenAPI spec // located nearby (/safekeeper/src/http/openapi_spec.yaml). 
+ let auth = conf.auth.clone(); router .data(Arc::new(conf)) .data(auth) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 746b4461b7..32a24a4978 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -8,11 +8,12 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::Context; use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; use utils::id::TenantTimelineId; +use utils::postgres_backend_async::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; @@ -47,7 +48,7 @@ pub struct AppendLogicalMessage { pg_version: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] struct AppendResult { // safekeeper state after append state: SafeKeeperState, @@ -62,8 +63,8 @@ pub fn handle_json_ctrl( spg: &SafekeeperPostgresHandler, pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, -) -> Result<()> { - info!("JSON_CTRL request: {:?}", append_request); +) -> Result<(), QueryError> { + info!("JSON_CTRL request: {append_request:?}"); // need to init safekeeper state before AppendRequest let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; @@ -78,7 +79,8 @@ pub fn handle_json_ctrl( state: tli.get_state().1, inserted_wal, }; - let response_data = serde_json::to_vec(&response)?; + let response_data = serde_json::to_vec(&response) + .with_context(|| format!("Response {response:?} is not a json array"))?; pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { name: b"json", @@ -93,7 +95,7 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. 
-fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result> { +fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result> { GlobalTimelines::create( ttid, ServerInfo { @@ -106,7 +108,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result, term: Term, lsn: Lsn) -> Result<()> { +fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { // add new term to existing history let history = tli.get_state().1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); @@ -125,7 +127,7 @@ fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> Result<() Ok(()) } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] struct InsertedWAL { begin_lsn: Lsn, end_lsn: Lsn, @@ -134,7 +136,10 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. -fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { +fn append_logical_message( + tli: &Arc, + msg: &AppendLogicalMessage, +) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = tli.get_state().1; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 60a1911068..891d73533f 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -24,7 +24,9 @@ pub mod wal_service; pub mod wal_storage; mod timelines_global_map; +use std::sync::Arc; pub use timelines_global_map::GlobalTimelines; +use utils::auth::JwtAuth; pub mod defaults { pub use safekeeper_api::{ @@ -51,12 +53,13 @@ pub struct SafeKeeperConf { pub listen_http_addr: String, pub no_sync: bool, pub broker_endpoint: Uri, + pub broker_keepalive_interval: Duration, pub heartbeat_timeout: Duration, pub remote_storage: Option, pub max_offloader_lag_bytes: u64, pub backup_runtime_threads: Option, pub wal_backup_enabled: bool, - pub auth_validation_public_key_path: 
Option, + pub auth: Option>, } impl SafeKeeperConf { @@ -83,9 +86,10 @@ impl SafeKeeperConf { broker_endpoint: storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint"), + broker_keepalive_interval: Duration::from_secs(5), backup_runtime_threads: None, wal_backup_enabled: true, - auth_validation_public_key_path: None, + auth: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index d4d3d37737..b21770686c 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -425,7 +425,7 @@ impl Collector for TimelineCollector { .set(tli.num_computes as i64); self.acceptor_term .with_label_values(labels) - .set(tli.persisted_state.acceptor_state.term as u64); + .set(tli.persisted_state.acceptor_state.term); self.written_wal_bytes .with_label_values(labels) .set(tli.wal_storage.write_wal_bytes); diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 6577e8c4d6..671e5470a0 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,11 +2,13 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. 
-use anyhow::{anyhow, bail, Result}; +use anyhow::anyhow; +use anyhow::Context; use bytes::BytesMut; use tracing::*; use utils::lsn::Lsn; +use utils::postgres_backend_async::QueryError; use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; @@ -43,7 +45,7 @@ impl<'pg> ReceiveWalConn<'pg> { } // Send message to the postgres - fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> { + fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> { let mut buf = BytesMut::with_capacity(128); msg.serialize(&mut buf)?; self.pg_backend.write_message(&BeMessage::CopyData(&buf))?; @@ -51,8 +53,8 @@ impl<'pg> ReceiveWalConn<'pg> { } /// Receive WAL from wal_proposer - pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered(); + pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> { + let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered(); // Notify the libpq client that it's allowed to send `CopyData` messages self.pg_backend @@ -69,7 +71,7 @@ impl<'pg> ReceiveWalConn<'pg> { let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with wal proposer {} sysid {} timeline {}", + "start handshake with walproposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); let server_info = ServerInfo { @@ -79,7 +81,11 @@ impl<'pg> ReceiveWalConn<'pg> { }; GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)? 
} - _ => bail!("unexpected message {:?} instead of greeting", next_msg), + _ => { + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message {next_msg:?} instead of greeting" + ))) + } }; let mut next_msg = Some(next_msg); @@ -134,25 +140,32 @@ impl<'pg> ReceiveWalConn<'pg> { struct ProposerPollStream { msg_rx: Receiver, - read_thread: Option>>, + read_thread: Option>>, } impl ProposerPollStream { - fn new(mut r: ReadStream) -> Result { + fn new(mut r: ReadStream) -> anyhow::Result { let (msg_tx, msg_rx) = channel(); let read_thread = thread::Builder::new() .name("Read WAL thread".into()) - .spawn(move || -> Result<()> { + .spawn(move || -> Result<(), QueryError> { loop { let copy_data = match FeMessage::read(&mut r)? { - Some(FeMessage::CopyData(bytes)) => bytes, - Some(msg) => bail!("expected `CopyData` message, found {:?}", msg), - None => bail!("connection closed unexpectedly"), - }; + Some(FeMessage::CopyData(bytes)) => Ok(bytes), + Some(msg) => Err(QueryError::Other(anyhow::anyhow!( + "expected `CopyData` message, found {msg:?}" + ))), + None => Err(QueryError::from(std::io::Error::new( + std::io::ErrorKind::ConnectionAborted, + "walproposer closed the connection", + ))), + }?; let msg = ProposerAcceptorMessage::parse(copy_data)?; - msg_tx.send(msg)?; + msg_tx + .send(msg) + .context("Failed to send the proposer message")?; } // msg_tx will be dropped here, this will also close msg_rx })?; @@ -163,17 +176,19 @@ impl ProposerPollStream { }) } - fn recv_msg(&mut self) -> Result { + fn recv_msg(&mut self) -> Result { self.msg_rx.recv().map_err(|_| { // return error from the read thread let res = match self.read_thread.take() { Some(thread) => thread.join(), - None => return anyhow!("read thread is gone"), + None => return QueryError::Other(anyhow::anyhow!("read thread is gone")), }; match res { - Ok(Ok(())) => anyhow!("unexpected result from read thread"), - Err(err) => anyhow!("read thread panicked: {:?}", err), + Ok(Ok(())) => { + 
QueryError::Other(anyhow::anyhow!("unexpected result from read thread")) + } + Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")), Ok(Err(err)) => err, } }) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 2c13f81476..fa973a3ede 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -182,7 +182,7 @@ pub struct SafeKeeperState { /// All WAL segments next to one containing local_start_lsn are /// filled with data from the beginning. pub local_start_lsn: Lsn, - /// Part of WAL acknowledged by quorum and available locally. Always points + /// Part of WAL acknowledged by quorum *and available locally*. Always points /// to record boundary. pub commit_lsn: Lsn, /// LSN that points to the end of the last backed up segment. Useful to @@ -501,10 +501,6 @@ impl AcceptorProposerMessage { /// - messages from compute (proposers) and provides replies /// - messages from broker peers pub struct SafeKeeper { - /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. - /// Note: be careful to set only if we are sure our WAL (term history) matches - /// committed one. - pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. 
pub epoch_start_lsn: Lsn, @@ -537,7 +533,6 @@ where } Ok(SafeKeeper { - global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, @@ -639,10 +634,12 @@ where // system_id will be updated on mismatch if self.state.server.system_id != msg.system_id { - warn!( - "unexpected system ID arrived, got {}, expected {}", - msg.system_id, self.state.server.system_id - ); + if self.state.server.system_id != 0 { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + } let mut state = self.state.clone(); state.server.system_id = msg.system_id; @@ -653,8 +650,9 @@ where } info!( - "processed greeting from proposer {:?}, sending term {:?}", - msg.proposer_id, self.state.acceptor_state.term + "processed greeting from walproposer {}, sending term {:?}", + msg.proposer_id.map(|b| format!("{:X}", b)).join(""), + self.state.acceptor_state.term ); Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { term: self.state.acceptor_state.term, @@ -727,6 +725,24 @@ where return Ok(None); } + // This might happen in a rare race when another (old) connection from + // the same walproposer writes + flushes WAL after this connection + // already sent flush_lsn in VoteRequest. It is generally safe to + // proceed, but to prevent commit_lsn surprisingly going down we should + // either refuse the session (simpler) or skip the part we already have + // from the stream (can be implemented). + if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at { + bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", + msg.term, self.flush_lsn(), msg.start_streaming_at) + } + // Otherwise this shouldn't happen. 
+ assert!( + msg.start_streaming_at >= self.inmem.commit_lsn, + "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}", + msg.start_streaming_at, + self.inmem.commit_lsn + ); + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -759,7 +775,6 @@ where // NB: on new clusters, this happens at the same time as // timeline_start_lsn initialization, it is taken outside to provide // upgrade. - self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. @@ -778,10 +793,21 @@ where Ok(None) } - /// Advance commit_lsn taking into account what we have locally - fn update_commit_lsn(&mut self) -> Result<()> { - let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); - assert!(commit_lsn >= self.inmem.commit_lsn); + /// Advance commit_lsn taking into account what we have locally. + /// + /// Note: it is assumed that 'WAL we have is from the right term' check has + /// already been done outside. + fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { + // Both peers and walproposer communicate this value, we might already + // have a fresher (higher) version. + candidate = max(candidate, self.inmem.commit_lsn); + let commit_lsn = min(candidate, self.flush_lsn()); + assert!( + commit_lsn >= self.inmem.commit_lsn, + "commit_lsn monotonicity violated: old={} new={}", + self.inmem.commit_lsn, + commit_lsn + ); self.inmem.commit_lsn = commit_lsn; @@ -847,14 +873,11 @@ where self.wal_store.flush_wal()?; } - // Update global_commit_lsn + // Update commit_lsn. 
if msg.h.commit_lsn != Lsn(0) { - // We also obtain commit lsn from peers, so value arrived here might be stale (less) - self.global_commit_lsn = max(self.global_commit_lsn, msg.h.commit_lsn); + self.update_commit_lsn(msg.h.commit_lsn)?; } - self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; - self.update_commit_lsn()?; // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only @@ -886,10 +909,6 @@ where /// Flush WAL to disk. Return AppendResponse with latest LSNs. fn handle_flush(&mut self) -> Result> { self.wal_store.flush_wal()?; - - // commit_lsn can be updated because we have new flushed data locally. - self.update_commit_lsn()?; - Ok(Some(AcceptorProposerMessage::AppendResponse( self.append_response(), ))) @@ -904,8 +923,7 @@ where // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. if sk_info.last_log_term == self.get_epoch() { - self.global_commit_lsn = max(Lsn(sk_info.commit_lsn), self.global_commit_lsn); - self.update_commit_lsn()?; + self.update_commit_lsn(Lsn(sk_info.commit_lsn))?; } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a3481430d0..20600ab694 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::timeline::{ReplicaState, Timeline}; use crate::wal_storage::WalReader; use crate::GlobalTimelines; -use anyhow::{bail, Context, Result}; +use anyhow::Context; use bytes::Bytes; use postgres_ffi::get_current_timestamp; @@ -15,7 +15,8 @@ use std::cmp::min; use std::net::Shutdown; use std::sync::Arc; use std::time::Duration; -use std::{str, thread}; +use std::{io, str, thread}; +use utils::postgres_backend_async::QueryError; use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; use tokio::sync::watch::Receiver; @@ -91,7 +92,7 @@ impl ReplicationConn { fn background_thread( mut stream_in: 
ReadStream, replica_guard: Arc, - ) -> Result<()> { + ) -> anyhow::Result<()> { let replica_id = replica_guard.replica; let timeline = &replica_guard.timeline; @@ -140,7 +141,7 @@ impl ReplicationConn { // Shutdown the connection, because rust-postgres client cannot be dropped // when connection is alive. let _ = stream_in.shutdown(Shutdown::Both); - bail!("Copy failed"); + anyhow::bail!("Copy failed"); } _ => { // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored. @@ -160,8 +161,8 @@ impl ReplicationConn { spg: &mut SafekeeperPostgresHandler, pgb: &mut PostgresBackend, mut start_pos: Lsn, - ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + ) -> Result<(), QueryError> { + let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered(); let tli = GlobalTimelines::get(spg.ttid)?; @@ -256,8 +257,10 @@ impl ReplicationConn { // to right pageserver. if tli.should_walsender_stop(replica_id) { // Shut down, timeline is suspended. - // TODO create proper error type for this - bail!("end streaming to {:?}", spg.appname); + return Err(QueryError::from(io::Error::new( + io::ErrorKind::ConnectionAborted, + format!("end streaming to {:?}", spg.appname), + ))); } // timeout expired: request pageserver status @@ -265,8 +268,7 @@ impl ReplicationConn { sent_ptr: end_pos.0, timestamp: get_current_timestamp(), request_reply: true, - })) - .context("Failed to send KeepAlive message")?; + }))?; continue; } } @@ -301,7 +303,7 @@ impl ReplicationConn { const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); // Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn. 
-async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> Result> { +async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result> { let commit_lsn: Lsn = *rx.borrow(); if commit_lsn > lsn { return Ok(Some(commit_lsn)); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index ae4d4cce09..fc971ca753 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -346,9 +346,7 @@ impl WalBackupTask { backup_lsn, commit_lsn, e ); - if retry_attempt < u32::MAX { - retry_attempt += 1; - } + retry_attempt = retry_attempt.saturating_add(1); } } } @@ -387,7 +385,7 @@ async fn backup_single_segment( ) -> Result<()> { let segment_file_path = seg.file_path(timeline_dir)?; let remote_segment_path = segment_file_path - .strip_prefix(&workspace_dir) + .strip_prefix(workspace_dir) .context("Failed to strip workspace dir prefix") .and_then(RemotePath::new) .with_context(|| { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index fd8f9d9dcf..3ca651d060 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,35 +2,28 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! -use anyhow::Result; use regex::Regex; use std::net::{TcpListener, TcpStream}; -use std::sync::Arc; use std::thread; use tracing::*; -use utils::auth::JwtAuth; +use utils::postgres_backend_async::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; use utils::postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. -pub fn thread_main( - conf: SafeKeeperConf, - listener: TcpListener, - auth: Option>, -) -> Result<()> { +pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! 
{ loop { match listener.accept() { Ok((socket, peer_addr)) => { debug!("accepted connection from {}", peer_addr); let conf = conf.clone(); - let auth = auth.clone(); let _ = thread::Builder::new() .name("WAL service thread".into()) .spawn(move || { - if let Err(err) = handle_socket(socket, conf, auth) { + if let Err(err) = handle_socket(socket, conf) { error!("connection handler exited: {}", err); } }) @@ -51,25 +44,17 @@ fn get_tid() -> u64 { /// This is run by `thread_main` above, inside a background thread. /// -fn handle_socket( - socket: TcpStream, - conf: SafeKeeperConf, - auth: Option>, -) -> Result<()> { +fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> { let _enter = info_span!("", tid = ?get_tid()).entered(); socket.set_nodelay(true)?; - let mut conn_handler = SafekeeperPostgresHandler::new(conf, auth.clone()); - let pgbackend = PostgresBackend::new( - socket, - match auth { - None => AuthType::Trust, - Some(_) => AuthType::NeonJWT, - }, - None, - false, - )?; + let auth_type = match conf.auth { + None => AuthType::Trust, + Some(_) => AuthType::NeonJWT, + }; + let mut conn_handler = SafekeeperPostgresHandler::new(conf); + let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?; // libpq replication protocol between safekeeper and replicas/pagers pgbackend.run(&mut conn_handler)?; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 52368bb719..41457868fe 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -223,7 +223,7 @@ impl PhysicalStorage { // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(&wal_file_partial_path, &wal_file_path)?; + fs::rename(wal_file_partial_path, wal_file_path)?; } else { // otherwise, file can be reused later self.file = Some(file); @@ -249,7 +249,7 @@ impl PhysicalStorage { while !buf.is_empty() { // Extract WAL 
location for this block - let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size); let segno = self.write_lsn.segment_number(self.wal_seg_size); // If crossing a WAL boundary, only write up until we reach wal segment size. @@ -366,7 +366,7 @@ impl Storage for PhysicalStorage { self.fdatasync_file(&mut unflushed_file)?; } - let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = end_pos.segment_offset(self.wal_seg_size); let segno = end_pos.segment_number(self.wal_seg_size); // Remove all segments after the given LSN. @@ -383,7 +383,7 @@ impl Storage for PhysicalStorage { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(&wal_file_path, &wal_file_partial_path)?; + fs::rename(wal_file_path, wal_file_partial_path)?; } // Update LSNs @@ -416,7 +416,7 @@ fn remove_segments_from_disk( let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; - for entry in fs::read_dir(&timeline_dir)? { + for entry in fs::read_dir(timeline_dir)? { let entry = entry?; let entry_path = entry.path(); let fname = entry_path.file_name().unwrap(); @@ -499,7 +499,7 @@ impl WalReader { // How much to read and send in message? We cannot cross the WAL file // boundary, and we don't want send more than provided buffer. - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.pos.segment_offset(self.wal_seg_size); let send_size = min(buf.len(), self.wal_seg_size - xlogoff); // Read some data from the file. @@ -518,7 +518,7 @@ impl WalReader { /// Open WAL segment at the current position of the reader. 
async fn open_segment(&self) -> Result>> { - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); let wal_file_path = self.timeline_dir.join(wal_file_name); diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 1734038661..d83a74ae14 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -318,14 +318,8 @@ def remote_consistent_lsn( detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: - # No remote information at all. This happens right after creating - # a timeline, before any part of it has been uploaded to remote - # storage yet. - return 0 - else: - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload( @@ -448,15 +442,15 @@ def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - conn = psycopg2.connect(pageserver_connstr) - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - prev_lsn = res[0] - last_lsn = res[1] - conn.close() + with closing(psycopg2.connect(pageserver_connstr)) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" + cur.execute(cmd) + res = cur.fetchone() + assert res is not None + prev_lsn = res[0] + last_lsn = res[1] return last_lsn, prev_lsn diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 7aa33a5234..180c506254 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -2,6 +2,7 
@@ name = "storage_broker" version = "0.1.0" edition = "2021" +license = "Apache-2.0" [features] bench = [] diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 73141318b8..f3544a7cb8 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -88,7 +88,7 @@ fn tli_from_u64(i: u64) -> Vec { async fn subscribe(client: Option, counter: Arc, i: u64) { let mut client = match client { Some(c) => c, - None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { @@ -112,7 +112,7 @@ async fn subscribe(client: Option, counter: Arc, async fn publish(client: Option, n_keys: u64) { let mut client = match client { Some(c) => c, - None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; let mut counter: u64 = 0; @@ -152,7 +152,7 @@ async fn main() -> Result<(), Box> { } let h = tokio::spawn(progress_reporter(counters.clone())); - let c = storage_broker::connect(DEFAULT_ENDPOINT).unwrap(); + let c = storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(); for i in 0..args.num_subs { let c = Some(c.clone()); @@ -160,7 +160,7 @@ async fn main() -> Result<(), Box> { } for _i in 0..args.num_pubs { let c = None; - tokio::spawn(publish(c, args.num_subs as u64)); + tokio::spawn(publish(c, args.num_subs)); } h.await?; diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1a743394ad..6d80e96bf1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -39,7 +39,9 @@ use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; use 
storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; -use storage_broker::{parse_proto_ttid, EitherBody, DEFAULT_LISTEN_ADDR}; +use storage_broker::{ + parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, +}; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; @@ -47,8 +49,8 @@ use utils::sentry_init::{init_sentry, release_name}; project_git_version!(GIT_VERSION); -const DEFAULT_CHAN_SIZE: usize = 128; -const DEFAULT_HTTP2_KEEPALIVE_INTERVAL: &str = "5000ms"; +const DEFAULT_CHAN_SIZE: usize = 32; +const DEFAULT_ALL_KEYS_CHAN_SIZE: usize = 16384; #[derive(Parser, Debug)] #[command(version = GIT_VERSION, about = "Broker for neon storage nodes communication", long_about = None)] @@ -56,11 +58,14 @@ struct Args { /// Endpoint to listen on. #[arg(short, long, default_value = DEFAULT_LISTEN_ADDR)] listen_addr: SocketAddr, - /// Size of the queue to the subscriber. + /// Size of the queue to the per timeline subscriber. #[arg(long, default_value_t = DEFAULT_CHAN_SIZE)] - chan_size: usize, + timeline_chan_size: usize, + /// Size of the queue to the all keys subscriber. + #[arg(long, default_value_t = DEFAULT_ALL_KEYS_CHAN_SIZE)] + all_keys_chan_size: usize, /// HTTP/2 keepalive interval. - #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HTTP2_KEEPALIVE_INTERVAL)] + #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_KEEPALIVE_INTERVAL)] http2_keepalive_interval: Duration, /// Format for logging, either 'plain' or 'json'. 
#[arg(long, default_value = "plain")] @@ -108,7 +113,7 @@ struct SharedState { } impl SharedState { - pub fn new(chan_size: usize) -> Self { + pub fn new(all_keys_chan_size: usize) -> Self { SharedState { next_pub_id: 0, num_pubs: 0, @@ -116,7 +121,7 @@ impl SharedState { num_subs_to_timelines: 0, chans_to_timeline_subs: HashMap::new(), num_subs_to_all: 0, - chan_to_all_subs: broadcast::channel(chan_size).0, + chan_to_all_subs: broadcast::channel(all_keys_chan_size).0, } } @@ -139,7 +144,7 @@ impl SharedState { pub fn register_subscriber( &mut self, sub_key: SubscriptionKey, - chan_size: usize, + timeline_chan_size: usize, ) -> (SubId, broadcast::Receiver) { let sub_id = self.next_sub_id; self.next_sub_id += 1; @@ -158,7 +163,7 @@ impl SharedState { self.chans_to_timeline_subs .entry(ttid) .or_insert(ChanToTimelineSub { - chan: broadcast::channel(chan_size).0, + chan: broadcast::channel(timeline_chan_size).0, num_subscribers: 0, }); chan_to_timeline_sub.num_subscribers += 1; @@ -200,7 +205,7 @@ impl SharedState { #[derive(Clone)] struct Registry { shared_state: Arc>, - chan_size: usize, + timeline_chan_size: usize, } impl Registry { @@ -232,7 +237,7 @@ impl Registry { let (sub_id, sub_rx) = self .shared_state .write() - .register_subscriber(sub_key, self.chan_size); + .register_subscriber(sub_key, self.timeline_chan_size); info!( "subscription started id={}, key={:?}, addr={:?}", sub_id, sub_key, remote_addr @@ -369,9 +374,9 @@ impl BrokerService for Broker { Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { - error!("subscription id={}, key={:?}, addr={:?} dropped {} messages, channel is full", + warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); - Err(Status::new(Code::Internal, "full channel"))?; + missed_msgs = 0; } } Err(RecvError::Closed) => { @@ -428,8 +433,8 @@ 
async fn main() -> Result<(), Box> { info!("version: {GIT_VERSION}"); let registry = Registry { - shared_state: Arc::new(RwLock::new(SharedState::new(args.chan_size))), - chan_size: args.chan_size, + shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))), + timeline_chan_size: args.timeline_chan_size, }; let storage_broker_impl = Broker { registry: registry.clone(), @@ -523,7 +528,7 @@ mod tests { async fn test_registry() { let registry = Registry { shared_state: Arc::new(RwLock::new(SharedState::new(16))), - chan_size: 16, + timeline_chan_size: 16, }; // subscribe to timeline 2 diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 0629caa2fb..8441aaf625 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,6 +1,7 @@ use hyper::body::HttpBody; use std::pin::Pin; use std::task::{Context, Poll}; +use std::time::Duration; use tonic::codegen::StdError; use tonic::transport::{ClientTlsConfig, Endpoint}; use tonic::{transport::Channel, Code, Status}; @@ -12,6 +13,10 @@ use proto::{ // Code generated by protobuf. pub mod proto { + // Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]` + // we don't use these types for anything but broker data transmission, + // so it's ok to ignore this one. + #![allow(clippy::derive_partial_eq_without_eq)] tonic::include_proto!("storage_broker"); } @@ -26,6 +31,8 @@ pub use hyper::Uri; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); +pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms"; + // BrokerServiceClient charged with tonic provided Channel transport; helps to // avoid depending on tonic directly in user crates. pub type BrokerClientChannel = BrokerServiceClient; @@ -33,7 +40,7 @@ pub type BrokerClientChannel = BrokerServiceClient; // Create connection object configured to run TLS if schema starts with https:// // and plain text otherwise. 
Connection is lazy, only endpoint sanity is // validated here. -pub fn connect(endpoint: U) -> anyhow::Result +pub fn connect(endpoint: U, keepalive_interval: Duration) -> anyhow::Result where U: std::convert::TryInto, U::Error: std::error::Error + Send + Sync + 'static, @@ -46,6 +53,10 @@ where let tls = ClientTlsConfig::new(); tonic_endpoint = tonic_endpoint.tls_config(tls)?; } + tonic_endpoint = tonic_endpoint + .http2_keep_alive_interval(keepalive_interval) + .keep_alive_while_idle(true); + // keep_alive_timeout is 20s by default on both client and server side let channel = tonic_endpoint.connect_lazy(); Ok(BrokerClientChannel::new(channel)) } diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 27fb0a60b2..b1489b7ab1 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -11,7 +11,7 @@ from datetime import datetime from pathlib import Path # Type-related stuff -from typing import Callable, ClassVar, Iterator, Optional +from typing import Callable, ClassVar, Dict, Iterator, Optional import pytest from _pytest.config import Config @@ -135,23 +135,26 @@ class PgBenchRunResult: @dataclasses.dataclass class PgBenchInitResult: - REGEX: ClassVar[re.Pattern] = re.compile( # type: ignore[type-arg] - r"done in (\d+\.\d+) s " - r"\(" - r"(?:drop tables (\d+\.\d+) s)?(?:, )?" - r"(?:create tables (\d+\.\d+) s)?(?:, )?" - r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" - r"(?:vacuum (\d+\.\d+) s)?(?:, )?" - r"(?:primary keys (\d+\.\d+) s)?(?:, )?" - r"\)\." 
- ) + # Taken from https://github.com/postgres/postgres/blob/REL_15_1/src/bin/pgbench/pgbench.c#L5144-L5171 + EXTRACTORS: ClassVar[Dict[str, re.Pattern]] = { # type: ignore[type-arg] + "drop_tables": re.compile(r"drop tables (\d+\.\d+) s"), + "create_tables": re.compile(r"create tables (\d+\.\d+) s"), + "client_side_generate": re.compile(r"client-side generate (\d+\.\d+) s"), + "server_side_generate": re.compile(r"server-side generate (\d+\.\d+) s"), + "vacuum": re.compile(r"vacuum (\d+\.\d+) s"), + "primary_keys": re.compile(r"primary keys (\d+\.\d+) s"), + "foreign_keys": re.compile(r"foreign keys (\d+\.\d+) s"), + "total": re.compile(r"done in (\d+\.\d+) s"), # Total time printed by pgbench + } - total: float + total: Optional[float] drop_tables: Optional[float] create_tables: Optional[float] client_side_generate: Optional[float] + server_side_generate: Optional[float] vacuum: Optional[float] primary_keys: Optional[float] + foreign_keys: Optional[float] duration: float start_timestamp: int end_timestamp: int @@ -164,25 +167,35 @@ class PgBenchInitResult: start_timestamp: int, end_timestamp: int, ): - # Parses pgbench initialize output for default initialization steps (dtgvp) + # Parses pgbench initialize output # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s). 
last_line = stderr.splitlines()[-1] - if (m := cls.REGEX.match(last_line)) is not None: - total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [ - float(v) for v in m.groups() if v is not None - ] - else: + timings: Dict[str, Optional[float]] = {} + last_line_items = re.split(r"\(|\)|,", last_line) + for item in last_line_items: + for key, regex in cls.EXTRACTORS.items(): + if (m := regex.match(item.strip())) is not None: + if key in timings: + raise RuntimeError( + f"can't store pgbench results for repeated action `{key}`" + ) + + timings[key] = float(m.group(1)) + + if not timings or "total" not in timings: raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`") return cls( - total=total, - drop_tables=drop_tables, - create_tables=create_tables, - client_side_generate=client_side_generate, - vacuum=vacuum, - primary_keys=primary_keys, + total=timings["total"], + drop_tables=timings.get("drop_tables", 0.0), + create_tables=timings.get("create_tables", 0.0), + client_side_generate=timings.get("client_side_generate", 0.0), + server_side_generate=timings.get("server_side_generate", 0.0), + vacuum=timings.get("vacuum", 0.0), + primary_keys=timings.get("primary_keys", 0.0), + foreign_keys=timings.get("foreign_keys", 0.0), duration=duration, start_timestamp=start_timestamp, end_timestamp=end_timestamp, @@ -326,8 +339,10 @@ class NeonBenchmarker: "drop_tables", "create_tables", "client_side_generate", + "server_side_generate", "vacuum", "primary_keys", + "foreign_keys", ] for metric in metrics: if (value := getattr(result, metric)) is not None: diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 291f924379..be1f146735 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -115,6 +115,7 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): + 
self.pageserver_http_client.timeline_checkpoint(self.env.initial_tenant, self.timeline) self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) def compact(self): @@ -176,7 +177,7 @@ class VanillaCompare(PgCompare): self.cur = self.conn.cursor() @property - def pg(self) -> PgProtocol: + def pg(self) -> VanillaPostgres: return self._pg @property diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 86ab4425ed..8b78e06c22 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -39,9 +39,16 @@ def parse_metrics(text: str, name: str = "") -> Metrics: return metrics +PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( + "pageserver_remote_timeline_client_calls_unfinished", + *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]], + *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]], + "pageserver_remote_physical_size", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", - "pageserver_current_physical_size", + "pageserver_resident_physical_size", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", @@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 818853a4ac..f284be8753 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -18,6 +18,7 @@ from contextlib import closing, contextmanager from dataclasses import dataclass, field from enum import Flag, auto from functools import cached_property +from itertools import chain, product from pathlib import Path from types import TracebackType from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast @@ -26,6 +27,7 @@ import asyncpg import backoff # type: ignore import boto3 import jwt +import prometheus_client import psycopg2 import pytest import requests @@ -33,6 +35,7 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -41,6 +44,7 @@ from fixtures.utils import ( get_self_dir, subprocess_capture, ) +from prometheus_client.parser import text_string_to_metric_families # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -286,24 +290,19 @@ def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def default_broker( - request: FixtureRequest, port_distributor: PortDistributor, - top_output_dir: Path, + test_output_dir: Path, neon_binpath: Path, ) -> Iterator[NeonBroker]: # multiple pytest sessions could get launched in parallel, get them different ports/datadirs client_port = 
port_distributor.get_port() - broker_logfile = ( - get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log" - ) - broker_logfile.parents[0].mkdir(exist_ok=True, parents=True) + broker_logfile = test_output_dir / "repo" / "storage_broker.log" broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) yield broker broker.stop() - allure_attach_from_dir(Path(broker_logfile)) @pytest.fixture(scope="session") @@ -598,6 +597,7 @@ class NeonEnvBuilder: rust_log_override: Optional[str] = None, default_branch_name: str = DEFAULT_BRANCH_NAME, preserve_database_files: bool = False, + initial_tenant: Optional[TenantId] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -620,19 +620,30 @@ class NeonEnvBuilder: self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version self.preserve_database_files = preserve_database_files + self.initial_tenant = initial_tenant or TenantId.generate() - def init(self) -> NeonEnv: + def init_configs(self) -> NeonEnv: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" self.env = NeonEnv(self) return self.env def start(self): + assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() def init_start(self) -> NeonEnv: - env = self.init() + env = self.init_configs() self.start() + + # Prepare the default branch to start the postgres on later. + # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API. 
+ log.info( + f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" + ) + initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant) + log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully") + return env def enable_remote_storage( @@ -756,6 +767,11 @@ class NeonEnvBuilder: log.info("no remote storage was set up, skipping cleanup") return + # Making mypy happy with allowing only `S3Storage` further. + # `self.remote_storage_prefix` is coupled with `S3Storage` storage type, + # so this line effectively a no-op + assert isinstance(self.remote_storage, S3Storage) + if self.keep_remote_storage_contents: log.info("keep_remote_storage_contents skipping remote storage cleanup") return @@ -771,7 +787,8 @@ class NeonEnvBuilder: Prefix=self.remote_storage_prefix, ) - objects_to_delete = {"Objects": []} + # Using Any because DeleteTypeDef (from boto3-stubs) doesn't fit our case + objects_to_delete: Any = {"Objects": []} cnt = 0 for item in pages.search("Contents"): # weirdly when nothing is found it returns [None] @@ -786,16 +803,17 @@ class NeonEnvBuilder: Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete, ) - objects_to_delete = dict(Objects=[]) + objects_to_delete = {"Objects": []} cnt += 1 # flush rest if len(objects_to_delete["Objects"]): self.remote_storage_client.delete_objects( - Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete + Bucket=self.remote_storage.bucket_name, + Delete=objects_to_delete, ) - log.info("deleted %s objects from remote storage", cnt) + log.info(f"deleted {cnt} objects from remote storage") def __enter__(self) -> "NeonEnvBuilder": return self @@ -884,12 +902,12 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
- self.initial_tenant = TenantId.generate() + self.initial_tenant = config.initial_tenant # Create a config file corresponding to the options toml = textwrap.dedent( f""" - default_tenant_id = '{self.initial_tenant}' + default_tenant_id = '{config.initial_tenant}' """ ) @@ -1012,7 +1030,7 @@ def _shared_simple_env( if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory - repo_dir = get_test_output_dir(request, top_output_dir) / "repo" + repo_dir = get_test_repo_dir(request, top_output_dir) else: # We're running shared fixtures. Share a single directory. repo_dir = top_output_dir / "shared_repo" @@ -1201,8 +1219,22 @@ class PageserverHttpClient(requests.Session): # there are no tests for those right now. return size - def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") + def timeline_list( + self, + tenant_id: TenantId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + ) -> List[Dict[str, Any]]: + + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -1236,13 +1268,13 @@ class PageserverHttpClient(requests.Session): tenant_id: TenantId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, - include_non_incremental_physical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: params["include-non-incremental-logical-size"] = "yes" - if include_non_incremental_physical_size: - 
params["include-non-incremental-physical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", @@ -1317,11 +1349,115 @@ class PageserverHttpClient(requests.Session): res_json = res.json() assert res_json is None + def timeline_spawn_download_remote_layers( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> dict[str, Any]: + + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_poll_download_remote_layers_status( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + spawn_response: dict[str, Any], + poll_state=None, + ) -> None | dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + + # assumption in this API client here is that nobody else spawns the task + assert res_json["task_id"] == spawn_response["task_id"] + + if poll_state is None or res_json["state"] == poll_state: + return res_json + return None + + def timeline_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + errors_ok=False, + at_least_one_download=True, + ): + res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id) + while True: + completed = self.timeline_poll_download_remote_layers_status( + tenant_id, timeline_id, res, poll_state="Completed" + ) + if not completed: + time.sleep(0.1) + continue + if not errors_ok: + assert completed["failed_download_count"] == 0 + if at_least_one_download: + assert completed["successful_download_count"] > 0 + return 
completed + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) return res.text + def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str): + raw = self.get_metrics() + family: List[prometheus_client.Metric] = list(text_string_to_metric_families(raw)) + [metric] = [m for m in family if m.name == metric_name] + [sample] = [ + s + for s in metric.samples + if s.labels["tenant_id"] == str(tenant_id) + and s.labels["timeline_id"] == str(timeline_id) + ] + return sample.value + + def get_remote_timeline_client_metric( + self, + metric_name: str, + tenant_id: TenantId, + timeline_id: TimelineId, + file_kind: str, + op_kind: str, + ) -> Optional[float]: + metrics = parse_metrics(self.get_metrics(), "pageserver") + matches = metrics.query_all( + name=metric_name, + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ) + if len(matches) == 0: + value = None + elif len(matches) == 1: + value = matches[0].value + assert value is not None + else: + assert len(matches) < 2, "above filter should uniquely identify metric" + return value + + def get_metric_value(self, name: str) -> Optional[str]: + metrics = self.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + log.info(f'could not find metric "{name}"') + return None + assert len(relevant) == 1 + return relevant[0].lstrip(name).strip() + @dataclass class PageserverPort: @@ -1432,6 +1568,7 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, conf: Optional[Dict[str, str]] = None, + set_default: bool = False, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. 
@@ -1440,47 +1577,51 @@ class NeonCli(AbstractNeonCli): tenant_id = TenantId.generate() if timeline_id is None: timeline_id = TimelineId.generate() - if conf is None: - res = self.raw_cli( - [ - "tenant", - "create", - "--tenant-id", - str(tenant_id), - "--timeline-id", - str(timeline_id), - "--pg-version", - self.env.pg_version, - ] - ) - else: - res = self.raw_cli( - [ - "tenant", - "create", - "--tenant-id", - str(tenant_id), - "--timeline-id", - str(timeline_id), - "--pg-version", - self.env.pg_version, - ] - + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + + args = [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--pg-version", + self.env.pg_version, + ] + if conf is not None: + args.extend( + chain.from_iterable( + product(["-c"], (f"{key}:{value}" for key, value in conf.items())) + ) ) + if set_default: + args.append("--set-default") + + res = self.raw_cli(args) res.check_returncode() return tenant_id, timeline_id + def set_default(self, tenant_id: TenantId): + """ + Update default tenant for future operations that require tenant_id. + """ + res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)]) + res.check_returncode() + def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]): """ Update tenant config. 
""" - if conf is None: - res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)]) - else: - res = self.raw_cli( - ["tenant", "config", "--tenant-id", str(tenant_id)] - + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + + args = ["tenant", "config", "--tenant-id", str(tenant_id)] + if conf is not None: + args.extend( + chain.from_iterable( + product(["-c"], (f"{key}:{value}" for key, value in conf.items())) + ) ) + + res = self.raw_cli(args) res.check_returncode() def list_tenants(self) -> "subprocess.CompletedProcess[str]": @@ -1515,36 +1656,6 @@ class NeonCli(AbstractNeonCli): return TimelineId(str(created_timeline_id)) - def create_root_branch( - self, - branch_name: str, - tenant_id: Optional[TenantId] = None, - ): - cmd = [ - "timeline", - "create", - "--branch-name", - branch_name, - "--tenant-id", - str(tenant_id or self.env.initial_tenant), - "--pg-version", - self.env.pg_version, - ] - - res = self.raw_cli(cmd) - res.check_returncode() - - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - if created_timeline_id is None: - raise Exception("could not find timeline id after `neon timeline create` invocation") - else: - return TimelineId(created_timeline_id) - def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, @@ -1600,17 +1711,12 @@ class NeonCli(AbstractNeonCli): def init( self, config_toml: str, - initial_timeline_id: Optional[TimelineId] = None, ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) tmp.flush() - cmd = ["init", f"--config={tmp.name}"] - if initial_timeline_id: - cmd.extend(["--timeline-id", str(initial_timeline_id)]) - - cmd.extend(["--pg-version", self.env.pg_version]) + cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] append_pageserver_param_overrides( 
params_to_update=cmd, @@ -1619,7 +1725,12 @@ class NeonCli(AbstractNeonCli): pageserver_config_override=self.env.pageserver.config_override, ) - res = self.raw_cli(cmd) + s3_env_vars = None + if self.env.remote_storage is not None and isinstance( + self.env.remote_storage, S3Storage + ): + s3_env_vars = self.env.remote_storage.access_env_vars() + res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() return res @@ -1742,6 +1853,12 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(args, check_return_code=check_return_code) + def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["start"], check_return_code=check_return_code) + + def stop(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["stop"], check_return_code=check_return_code) + class WalCraft(AbstractNeonCli): """ @@ -1796,14 +1913,17 @@ class NeonPageserver(PgProtocol): ".*wal receiver task finished with an error: walreceiver connection handling failure.*", ".*Shutdown task error: walreceiver connection handling failure.*", ".*wal_connection_manager.*tcp connect error: Connection refused.*", - ".*query handler for .* failed: Connection reset by peer.*", - ".*serving compute connection task.*exited with error: Broken pipe.*", - ".*Connection aborted: error communicating with the server: Broken pipe.*", - ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*", - ".*Connection aborted: error communicating with the server: Connection reset by peer.*", + ".*query handler for .* failed: Socket IO error: Connection reset by peer.*", + ".*serving compute connection task.*exited with error: Postgres connection error.*", + ".*serving compute connection task.*exited with error: Connection reset by peer.*", + ".*serving compute connection task.*exited with error: Postgres query error.*", + ".*Connection aborted: connection error: error communicating with the server: 
Broken pipe.*", + ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*", + ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*", ".*kill_and_wait_impl.*: wait successful.*", - ".*end streaming to Some.*", + ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*", ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down + ".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation # and streaming start ".*Failed to process query for timeline .*: state uninitialized, no data to read.*", @@ -1873,10 +1993,6 @@ class NeonPageserver(PgProtocol): if '"testing"' not in self.version: pytest.skip("pageserver was built without 'testing' feature") - def is_profiling_enabled_or_skip(self): - if '"profiling"' not in self.version: - pytest.skip("pageserver was built without 'profiling' feature") - def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient: return PageserverHttpClient( port=self.service_port.http, @@ -1903,6 +2019,28 @@ class NeonPageserver(PgProtocol): assert not errors + def log_contains(self, pattern: str) -> Optional[str]: + """Check that the pageserver log contains a line that matches the given regex""" + logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. 
+ while True: + line = logfile.readline() + if not line: + break + + if contains_re.search(line): + # found it! + return line + + return None + def append_pageserver_param_overrides( params_to_update: List[str], @@ -2749,7 +2887,7 @@ class NeonBroker: log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') with open(self.logfile, "wb") as logfile: args = [ - self.neon_binpath / "storage_broker", + str(self.neon_binpath / "storage_broker"), f"--listen-addr={listen_addr}", ] self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) @@ -2785,6 +2923,10 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return get_test_output_dir(request, top_output_dir) / "repo" + + def pytest_addoption(parser: Parser): parser.addoption( "--preserve-database-files", @@ -2961,13 +3103,55 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def assert_no_in_progress_downloads_for_tenant( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, +def wait_until(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. + """ + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception + + +def wait_while(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns false, or throws an exception. 
+ """ + for i in range(number_of_iterations): + try: + if not func(): + return + log.info("waiting for %s iteration %s failed", func, i + 1) + time.sleep(interval) + continue + except Exception: + return + raise Exception("timed out while waiting for %s" % func) + + +def assert_tenant_status( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str ): tenant_status = pageserver_http_client.tenant_status(tenant) - assert tenant_status["has_in_progress_downloads"] is False, tenant_status - assert tenant_status["state"] == "Active" + log.info(f"tenant_status: {tenant_status}") + assert tenant_status["state"] == expected_status, tenant_status + + +def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId): + tenants = ps_http.tenant_list() + matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] + assert len(matching) < 2 + if len(matching) == 0: + return None + return matching[0] def remote_consistent_lsn( @@ -2975,14 +3159,15 @@ def remote_consistent_lsn( ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: + if detail["remote_consistent_lsn"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
return Lsn(0) - assert isinstance(lsn_str, str) - return Lsn(lsn_str) + else: + lsn_str = detail["remote_consistent_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) def wait_for_upload( @@ -2995,6 +3180,7 @@ def wait_for_upload( for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: + log.info("wait finished") return log.info( "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( @@ -3084,3 +3270,34 @@ def fork_at_current_lsn( """ current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) + + +def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: List[Safekeeper], + pageserver: NeonPageserver, +): + sk_commit_lsns = [ + sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers + ] + lsn = max(sk_commit_lsns) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn) + return lsn + + +def wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: List[Safekeeper], + pageserver: NeonPageserver, +): + lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( + tenant_id, timeline_id, safekeepers, pageserver + ) + ps_http = pageserver.http_client() + # force a checkpoint to trigger upload + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, lsn) + return lsn diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 1fb9eb72e6..df83fc6377 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -148,7 +148,7 @@ def get_scale_for_db(size_mb: int) -> int: ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - 
r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)" + r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)" ) diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index a32ce87c33..c1a57fb28b 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -1,12 +1,8 @@ # Running locally -First make a release build. The profiling flag is optional, used only for tests that -generate flame graphs. The `-s` flag just silences a lot of output, and makes it +First make a release build. The `-s` flag silences a lot of output, and makes it easier to see if you have compile errors without scrolling up. -`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8` - -NOTE: the `profiling` flag only works on linux because we use linux-specific -libc APIs like `libc::timer_t`. +`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8` Then run the tests `NEON_BIN=./target/release poetry run pytest test_runner/performance"` diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index 01b2097112..a91c78e867 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,5 +1,6 @@ from contextlib import closing from io import BufferedReader, RawIOBase +from typing import Optional from fixtures.compare_fixtures import PgCompare @@ -8,7 +9,7 @@ class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows self.rownum = 0 - self.linebuf = None + self.linebuf: Optional[bytes] = None self.ptr = 0 def readable(self): diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 015cc40a72..2b8760dff2 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -8,14 +8,14 @@ from typing import Dict, List import pytest from fixtures.benchmark_fixture import 
MetricReport, PgBenchInitResult, PgBenchRunResult -from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.compare_fixtures import PgCompare from fixtures.utils import get_scale_for_db @enum.unique class PgBenchLoadType(enum.Enum): INIT = "init" - SIMPLE_UPDATE = "simple_update" + SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" @@ -94,7 +94,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) + init_pgbench( + env, ["pgbench", f"-s{scale}", "-i", "-I", "dtGvp", connstr], password=password + ) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload @@ -174,28 +176,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY) -# Run the pgbench tests, and generate a flamegraph from it -# This requires that the pageserver was built with the 'profiling' feature. -# -# TODO: If the profiling is cheap enough, there's no need to run the same test -# twice, with and without profiling. But for now, run it separately, so that we -# can see how much overhead the profiling adds. 
-@pytest.mark.parametrize("scale", get_scales_matrix()) -@pytest.mark.parametrize("duration", get_durations_matrix()) -def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): - neon_env_builder.pageserver_config_override = """ -profiling="page_requests" -""" - env = neon_env_builder.init_start() - env.pageserver.is_profiling_enabled_or_skip() - env.neon_cli.create_branch("empty", "main") - - neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench") - run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT) - run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) - run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) - - # The following 3 tests run on an existing database as it was set up by previous tests, # and leaves the database in a state that would be used in the next tests. # Modifying the definition order of these functions or adding other remote tests in between will alter results. diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index a61d64553d..bd84724405 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -22,15 +22,16 @@ from pytest_lazyfixture import lazy_fixture # type: ignore ], ) @pytest.mark.parametrize( - "env, scale", + "env,scale", [ # Run on all envs. Use 200x larger table on remote cluster to make sure # it doesn't fit in shared buffers, which are larger on remote than local. 
pytest.param(lazy_fixture("neon_compare"), 1, id="neon"), pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"), - pytest.param( - lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster - ), + # Reenable after switching per-test projects created via API + # pytest.param( + # lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster + # ), ], ) def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int): @@ -45,7 +46,7 @@ def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: in # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index dfbf956568..cc807b7ff3 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -84,6 +84,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. + pageserver_http_client.timeline_checkpoint(tenant, timeline_main) pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( @@ -156,6 +157,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. 
pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) + pageserver_http_client.timeline_checkpoint(tenant, b0) def do_gc(): pageserver_http_client.timeline_gc(tenant, b0, 0) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index a841e3ced2..d19f6a7d39 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -109,6 +109,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # check that we cannot create branch based on garbage collected data with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) print_gc_result(gc_result) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 71964f622f..05d5788028 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*Failed to load delta layer.*", + ".*Failed to reconstruct the page.*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -87,9 +87,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}" ) - # Second timeline has no ancestors, only the metadata file and no layer files. - # That is checked explicitly in the pageserver, and causes the tenant to be marked - # as broken. + # Second timeline has no ancestors, only the metadata file and no layer files locally, + # and we don't have the remote storage enabled. It is loaded into memory, but getting + # the basebackup from it will fail. 
with pytest.raises( Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken" ) as err: @@ -97,8 +97,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): log.info(f"As expected, compute startup failed for timeline with missing layers: {err}") # Third timeline will also fail during basebackup, because the layer file is corrupt. + # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) - with pytest.raises(Exception, match="Failed to load delta layer") as err: + with pytest.raises(Exception, match="Failed to reconstruct the page") as err: pg3.start() log.info( f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index 74ee2a89d4..f973bd8e60 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -193,8 +193,8 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): timeout=10, ) except TimeoutExpired as exc: - ctl_logs = exc.stderr.decode("utf-8") - log.info("compute_ctl output:\n" + ctl_logs) + ctl_logs = (exc.stderr or b"").decode("utf-8") + log.info(f"compute_ctl output:\n{ctl_logs}") with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): start = "starting safekeepers syncing" @@ -240,7 +240,7 @@ class ExternalProcessManager: with self.pid_file: try: os.kill(self.pid, signal.SIGTERM) - except os.OsError as e: + except OSError as e: if not self.path.is_file(): return log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}") diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py old mode 100644 new mode 100755 diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 332bef225f..5f052bf81a 100644 ---
a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,9 +2,17 @@ import asyncio import concurrent.futures import random +import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import TimelineId +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + Postgres, + RemoteStorageKind, + wait_for_last_flush_lsn, +) +from fixtures.types import TenantId, TimelineId from fixtures.utils import query_scalar # Test configuration @@ -35,11 +43,13 @@ async def gc(env: NeonEnv, timeline: TimelineId): loop = asyncio.get_running_loop() + def do_gc(): + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) + pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + with concurrent.futures.ThreadPoolExecutor() as pool: while updates_performed < updates_to_perform: - await loop.run_in_executor( - pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) - ) + await loop.run_in_executor(pool, do_gc) # At the same time, run UPDATEs and GC @@ -87,3 +97,81 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): r = cur.fetchone() assert r is not None assert r == (num_rows, updates_to_perform) + + +# +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): + + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_gc_index_upload", + ) + + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_gc_index_upload", "main") + pg = env.postgres.create_start("test_gc_index_upload") + + pageserver_http = 
env.pageserver.http_client() + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) + timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Helper function that gets the number of given kind of remote ops from the metrics + def get_num_remote_ops(file_kind: str, op_kind: str) -> int: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + total = 0.0 + for sample in ps_metrics.query_all( + name="pageserver_remote_operation_seconds_count", + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ): + total += sample[2] + return int(total) + + # Sanity check that the metric works + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + before = get_num_remote_ops("index", "upload") + assert before > 0 + + # Run many cycles of GC. Then check that the number of index files + # uploads didn't grow much. In particular we don't want to re-upload the + # index file on every GC iteration, when it has no work to do. + # + # On each iteration, we use a slightly smaller GC horizon, so that the GC + # at least needs to check if it has work to do. + for i in range(100): + cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')") + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32) + num_index_uploads = get_num_remote_ops("index", "upload") + + # Also make sure that a no-op compaction doesn't upload the index + # file unnecessarily. 
+ pageserver_http.timeline_compact(tenant_id, timeline_id) + + log.info(f"{num_index_uploads} index uploads after GC iteration {i}") + + after = num_index_uploads + log.info(f"{after-before} new index uploads during test") + assert after - before < 5 diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 1a99d13a0b..0388e24e98 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -53,10 +53,10 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build unpacked_base = os.path.join(basebackup_dir, "unpacked-base") corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar") os.mkdir(unpacked_base, 0o750) - subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) + subprocess_capture(test_output_dir, ["tar", "-xf", base_tar, "-C", unpacked_base]) os.remove(os.path.join(unpacked_base, "global/pg_control")) subprocess_capture( - str(test_output_dir), + test_output_dir, ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), cwd=unpacked_base, ) @@ -306,6 +306,7 @@ def _import( # Check that gc works pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant, timeline) pageserver_http.timeline_gc(tenant, timeline, 0) return tar_output_file diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py new file mode 100644 index 0000000000..d1fcab7a62 --- /dev/null +++ b/test_runner/regress/test_metric_collection.py @@ -0,0 +1,163 @@ +import time + +import pytest +from fixtures.log_helper import log +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PortDistributor, + RemoteStorageKind, + wait_for_last_flush_lsn, +) +from fixtures.types import TenantId, TimelineId +from fixtures.utils import query_scalar +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from 
werkzeug.wrappers.response import Response + + +@pytest.fixture(scope="session") +def httpserver_listen_address(port_distributor: PortDistributor): + port = port_distributor.get_port() + return ("localhost", port) + + +initial_tenant = TenantId.generate() +remote_uploaded = 0 +checks = { + "written_size": lambda value: value > 0, + "resident_size": lambda value: value >= 0, + # >= 0 check here is to avoid race condition when we receive metrics before + # remote_uploaded is updated + "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, + # logical size may lag behind the actual size, so allow 0 here + "timeline_logical_size": lambda value: value >= 0, +} + +metric_kinds_checked = set([]) + + +# +# verify that metrics look minimally sane +# +def metrics_handler(request: Request) -> Response: + if request.json is None: + return Response(status=400) + + events = request.json["events"] + log.info("received events:") + log.info(events) + + for event in events: + assert event["tenant_id"] == str( + initial_tenant + ), "Expecting metrics only from the initial tenant" + metric_name = event["metric"] + + check = checks.get(metric_name) + # calm down mypy + if check is not None: + assert check(event["value"]), f"{metric_name} isn't valid" + global metric_kinds_checked + metric_kinds_checked.add(metric_name) + + return Response(status=200) + + +@pytest.mark.parametrize( + "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS] +) +def test_metric_collection( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, + remote_storage_kind: RemoteStorageKind, +): + (host, port) = httpserver_listen_address + metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + # Require collecting metrics frequently, since we change + # the timeline and want something to be logged about it.
+ # + # Disable time-based pitr, we will use the manual GC calls + # to trigger remote storage operations in a controlled way + neon_env_builder.pageserver_config_override = ( + f""" + metric_collection_interval="1s" + metric_collection_endpoint="{metric_collection_endpoint}" + """ + + "tenant_config={pitr_interval = '0 sec'}" + ) + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_metric_collection", + ) + + log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") + + # Set initial tenant of the test, that we expect the logs from + global initial_tenant + initial_tenant = neon_env_builder.initial_tenant + # mock http server that returns OK for the metrics + httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( + metrics_handler + ) + + # spin up neon, after http server is ready + env = neon_env_builder.init_start() + # Order of fixtures shutdown is not specified, and if http server gets down + # before pageserver, pageserver log might contain such errors in the end. 
+ env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") + env.neon_cli.create_branch("test_metric_collection") + pg = env.postgres.create_start("test_metric_collection") + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) + timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Helper function that gets the number of given kind of remote ops from the metrics + def get_num_remote_ops(file_kind: str, op_kind: str) -> int: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + total = 0.0 + for sample in ps_metrics.query_all( + name="pageserver_remote_operation_seconds_count", + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ): + total += sample[2] + return int(total) + + # upload some data to remote storage + if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + global remote_uploaded + remote_uploaded = get_num_remote_ops("index", "upload") + assert remote_uploaded > 0 + + # wait longer than collecting interval and check that all requests are served + time.sleep(3) + httpserver.check() + global metric_kinds_checked, checks + expected_checks = set(checks.keys()) + assert len(metric_kinds_checked) == len( + checks + ), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered" diff --git a/test_runner/regress/test_neon_local_cli.py 
b/test_runner/regress/test_neon_local_cli.py new file mode 100644 index 0000000000..bd0f550ba5 --- /dev/null +++ b/test_runner/regress/test_neon_local_cli.py @@ -0,0 +1,17 @@ +from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor + + +# Test that neon cli is able to start and stop all processes with the user defaults. +# Repeats the example from README.md as close as it can +def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor): + env = neon_env_builder.init_configs() + # Skipping the init step that creates a local tenant in Pytest tests + try: + env.neon_cli.start() + env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) + env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port()) + + env.neon_cli.create_branch(new_branch_name="migration_check") + env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port()) + finally: + env.neon_cli.stop() diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 3e387bb6cc..9885a811e1 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -45,7 +45,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # will cause GetPage requests. cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) @@ -59,6 +59,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. 
for i in range(10): + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) print_gc_result(gc_result) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py new file mode 100644 index 0000000000..184dc13888 --- /dev/null +++ b/test_runner/regress/test_ondemand_download.py @@ -0,0 +1,440 @@ +# It's possible to run any regular test with the local fs remote storage via +# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... + +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_sk_commit_lsn_to_reach_remote_storage, + wait_for_upload, + wait_until, +) +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +def get_num_downloaded_layers(client, tenant_id, timeline_id): + value = client.get_metric_value( + f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}' + ) + if value is None: + return 0 + return int(value) + + +# +# If you have a large relation, check that the pageserver downloads parts of it as +# required by queries.
+# +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_ondemand_download_large_rel( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_large_rel", + ) + + ##### First start, insert secret data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB + "compaction_threshold": "3", + "compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + # We want to make sure that the data is large enough that the keyspace is partitioned. 
+ num_rows = 1000000 + + with pg.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute( + f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) + cur.execute("CREATE INDEX ON tbl (id)") + cur.execute("VACUUM tbl") + + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info("uploads have finished") + + ##### Stop the first pageserver instance, erase all its data + pg.stop() + env.pageserver.stop() + + # remove all the layer files + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start() + + pg.start() + before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + + # Probe in the middle of the table. There's a high chance that the beginning + # and end of the table was stored together in the same layer files with data + # from other tables, and with the entry that stores the size of the + # relation, so they are likely already downloaded. But the middle of the + # table should not have been needed by anything yet. 
+ with pg.cursor() as cur: + assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1 + + after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + log.info(f"layers downloaded before {before_downloads} and after {after_downloads}") + assert after_downloads > before_downloads + + +# +# If you have a relation with a long history of updates, the pageserver downloads the layer +# files containing the history as needed by timetravel queries. +# +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_ondemand_download_timetravel( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_timetravel", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # Disable background GC & compaction + # We don't want GC, that would break the assertion about num downloads. + # We don't want background compaction, we force a compaction every time we do explicit checkpoint. 
+ "gc_period": "0s", + "compaction_period": "0s", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "compaction_threshold": "1", + "image_creation_threshold": "1", + "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + lsns = [] + + table_len = 10000 + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); + INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + lsns.append((0, current_lsn)) + + for checkpoint_number in range(1, 20): + with pg.cursor() as cur: + cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((checkpoint_number, current_lsn)) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + + ##### Stop the first pageserver instance, erase all its data + env.postgres.stop_all() + + # wait until pageserver has successfully uploaded all the data to remote storage + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + def 
get_api_current_physical_size(): + d = client.timeline_detail(tenant_id, timeline_id) + return d["current_physical_size"] + + def get_resident_physical_size(): + return client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + filled_current_physical = get_api_current_physical_size() + log.info(filled_current_physical) + filled_size = get_resident_physical_size() + log.info(filled_size) + assert filled_current_physical == filled_size, "we don't yet do layer eviction" + + env.pageserver.stop() + + # remove all the layer files + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start() + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + # The current_physical_size reports the sum of layers loaded in the layer + # map, regardless of where the layer files are located. So even though we + # just removed the local files, they still count towards + # current_physical_size because they are loaded as `RemoteLayer`s. 
+ assert filled_current_physical == get_api_current_physical_size() + + # Run queries at different points in time + num_layers_downloaded = [0] + resident_size = [get_resident_physical_size()] + for (checkpoint_number, lsn) in lsns: + pg_old = env.postgres.create_start( + branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn + ) + with pg_old.cursor() as cur: + # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000 + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}", + ) + == 0 + ) + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number={checkpoint_number}", + ) + == table_len + ) + + after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + num_layers_downloaded.append(after_downloads) + log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}") + + # Check that on each query, we need to download at least one more layer file. However in + # practice, thanks to compaction and the fact that some requests need to download + # more history, some points-in-time are covered by earlier downloads already. But + # in broad strokes, as we query more points-in-time, more layers need to be downloaded. + # + # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded + # more files than we had three iterations ago. 
+ log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}") + if len(num_layers_downloaded) > 4: + assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4] + + # Likewise, assert that the resident_physical_size metric grows as layers are downloaded + resident_size.append(get_resident_physical_size()) + log.info(f"resident_size[-1]={resident_size[-1]}") + if len(resident_size) > 4: + assert resident_size[-1] > resident_size[len(resident_size) - 4] + + # current_physical_size reports the total size of all layer files, whether + # they are present only in the remote storage, only locally, or both. + # It should not change. + assert filled_current_physical == get_api_current_physical_size() + + +# +# Ensure that the `download_remote_layers` API works +# +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_download_remote_layers_api( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_download_remote_layers_api", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # Disable background GC & compaction + # We don't want GC, that would break the assertion about num downloads. + # We don't want background compaction, we force a compaction every time we do explicit checkpoint. 
+ "gc_period": "0s", + "compaction_period": "0s", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "compaction_threshold": "1", + "image_creation_threshold": "1", + "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + table_len = 10000 + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); + INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); + """ + ) + + env.postgres.stop_all() + + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + def get_api_current_physical_size(): + d = client.timeline_detail(tenant_id, timeline_id) + return d["current_physical_size"] + + def get_resident_physical_size(): + return client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + filled_current_physical = get_api_current_physical_size() + log.info(filled_current_physical) + filled_size = get_resident_physical_size() + log.info(filled_size) + assert filled_current_physical == filled_size, "we don't yet do layer eviction" + + env.pageserver.stop() + + # remove all the layer files + # XXX only delete some of the layer files, to show that it really just downloads all the layers + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + # Shut down safekeepers before starting the pageserver. + # If we don't, the tenant's walreceiver handler will trigger the + # logical size computation task, and that downloads layers, + # which makes our assertions on size fail.
+ for sk in env.safekeepers: + sk.stop(immediate=True) + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) + env.pageserver.allowed_errors.extend( + [ + f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint", + f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size", + ] + ) + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + ###### Phase 1: exercise download error code path + assert ( + filled_current_physical == get_api_current_physical_size() + ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + post_unlink_size = get_resident_physical_size() + log.info(post_unlink_size) + assert ( + post_unlink_size < filled_size + ), "we just deleted layers and didn't cause anything to re-download them yet" + assert filled_size - post_unlink_size > 5 * ( + 1024**2 + ), "we may be downloading some layers as part of tenant activation" + + # issue downloads that we know will fail + info = client.timeline_download_remote_layers( + tenant_id, timeline_id, errors_ok=True, at_least_one_download=False + ) + log.info(f"info={info}") + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] == 0 + assert ( + info["failed_download_count"] > 0 + ) # can't assert == total_layer_count because attach + tenant status downloads some layers + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + assert get_api_current_physical_size() == filled_current_physical + assert ( + get_resident_physical_size() == post_unlink_size + ), "didn't download anything new due to failpoint" + # would be nice to assert that the layers in the layer map are still RemoteLayer + + ##### Retry, this 
time without failpoints + client.configure_failpoints(("remote-storage-download-pre-rename", "off")) + info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False) + log.info(f"info={info}") + + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] > 0 + assert info["failed_download_count"] == 0 + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + + refilled_size = get_resident_physical_size() + log.info(refilled_size) + + assert filled_size == refilled_size, "we redownloaded all the layers" + assert get_api_current_physical_size() == filled_current_physical + + for sk in env.safekeepers: + sk.start() + + # ensure that all the data is back + pg_old = env.postgres.create_start(branch_name="main") + with pg_old.cursor() as cur: + assert query_scalar(cur, "select count(*) from testtab") == table_len diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index e48815906b..6388e979e5 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -32,7 +32,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) @@ -115,7 +115,7 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 
'shared_buffers' """ ) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index d8b7256577..fe4fbc0927 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -52,6 +52,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # run GC with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) pageserver_http.timeline_compact(env.initial_tenant, timeline) # perform aggressive GC. Data still should be kept because of the PITR setting. gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index eab9505fbb..e13ba51f4b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -63,7 +63,11 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx "port": local_vanilla_pg.default_options["port"], "dbname": local_vanilla_pg.default_options["dbname"], "user": pg_user, - "project": "irrelevant", + "aux": { + "project_id": "project", + "endpoint_id": "endpoint", + "branch_id": "branch", + }, } }, } @@ -71,6 +75,7 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx log.info("sending session activation message") psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info) + assert psql.stdout is not None out = (await psql.stdout.read()).decode("utf-8").strip() assert out == "ok" @@ -122,3 +127,33 @@ def test_auth_errors(static_proxy: NeonProxy): # Finally, check that the user can connect with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"): pass + + +def test_forward_params_to_client(static_proxy: NeonProxy): + # A subset of parameters (GUCs) which postgres + # sends to the client during connection setup. + # Unfortunately, `GUC_REPORT` can't be queried. 
+ # Proxy *should* forward them, otherwise client library + # might misbehave (e.g. parse timestamps incorrectly). + reported_params_subset = [ + "client_encoding", + "integer_datetimes", + "is_superuser", + "server_encoding", + "server_version", + "session_authorization", + "standard_conforming_strings", + ] + + query = """ + select name, setting + from pg_catalog.pg_settings + where name = any(%s) + """ + + with static_proxy.connect(options="project=irrelevant") as conn: + with conn.cursor() as cur: + cur.execute(query, (reported_params_subset,)) + for name, value in cur.fetchall(): + # Check that proxy has forwarded this parameter. + assert conn.get_parameter_status(name) == value diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 1e93958e98..09644eaaa1 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -12,11 +12,9 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Override default checkpointer settings to run it more often neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - env = neon_env_builder.init() + env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() - neon_env_builder.start() - # These warnings are expected, when the pageserver is restarted abruptly env.pageserver.allowed_errors.append(".*found future delta layer.*") env.pageserver.allowed_errors.append(".*found future image layer.*") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 7152bc8b6a..82bf741a8f 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -2,11 +2,11 @@ # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
import os -import re import shutil import threading import time from pathlib import Path +from typing import Dict, List, Tuple import pytest from fixtures.log_helper import log @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PageserverApiException, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, available_remote_storages, wait_for_last_flush_lsn, wait_for_last_record_lsn, @@ -56,10 +55,15 @@ def test_remote_storage_backup_and_restore( test_name="test_remote_storage_backup_and_restore", ) - data_id = 1 - data_secret = "very secret secret" + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. + neon_env_builder.pageserver_config_override = "test_remote_failures=1" - ##### First start, insert secret data and upload it to the remote storage + data_id = 1 + data = "just some data" + + ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() # FIXME: Is this expected? 
@@ -76,6 +80,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.allowed_errors.append( ".*Cannot attach tenant .*?, local tenant directory already exists.*" ) + env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -87,22 +92,12 @@ def test_remote_storage_backup_and_restore( checkpoint_numbers = range(1, 3) - # On the first iteration, exercise retry code path by making the uploads - # fail for the first 3 times - action = "3*return->off" - pageserver_http.configure_failpoints( - [ - ("before-upload-layer", action), - ("before-upload-index", action), - ] - ) - for checkpoint_number in checkpoint_numbers: with pg.cursor() as cur: cur.execute( f""" - CREATE TABLE t{checkpoint_number}(id int primary key, secret text); - INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + CREATE TABLE t{checkpoint_number}(id int primary key, data text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data}|{checkpoint_number}'); """ ) current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -118,6 +113,14 @@ def test_remote_storage_backup_and_restore( wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() env.pageserver.stop() @@ -129,36 +132,53 @@ def test_remote_storage_backup_and_restore( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - # Introduce failpoint in download - pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", 
"return")) - + # Introduce failpoint in list remote timelines code path to make tenant_attach fail. + # This is before the failures injected by test_remote_failures, so it's a permanent error. + pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) + env.pageserver.allowed_errors.append( + ".*error attaching tenant: storage-sync-list-remote-timelines", + ) + # Attach it. This HTTP request will succeed and launch a + # background task to load the tenant. In that background task, + # listing the remote timelines will fail because of the failpoint, + # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) - - # is there a better way to assert that failpoint triggered? wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) - # assert cannot attach timeline that is scheduled for download - # FIXME implement layer download retries + # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): client.tenant_attach(tenant_id) - tenant_status = client.tenant_status(tenant_id) - log.info("Tenant status with active failpoint: %s", tenant_status) - # FIXME implement layer download retries - # assert tenant_status["has_in_progress_downloads"] is True - - # trigger temporary download files removal + # Restart again, this implicitly clears the failpoint. + # test_remote_failures=1 remains active, though, as it's in the pageserver config. + # This means that any of the remote client operations after restart will exercise the + # retry code path. + # + # The initiated attach operation should survive the restart, and continue from where it was. 
env.pageserver.stop() + layer_download_failed_regex = ( + r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" + ) + assert not env.pageserver.log_contains( + layer_download_failed_regex + ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" env.pageserver.start() - # ensure that an initiated attach operation survives pageserver restart + # Ensure that the pageserver remembers that the tenant was attaching, by + # trying to attach it again. It should fail. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): client.tenant_attach(tenant_id) - log.info("waiting for timeline redownload") + log.info("waiting for tenant to become active. this should be quick with on-demand download") + + def tenant_active(): + all_states = client.tenant_list() + [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] + assert tenant["state"] == "Active" + wait_until( - number_of_iterations=20, + number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=tenant_active, ) detail = client.timeline_detail(tenant_id, timeline_id) @@ -167,14 +187,18 @@ def test_remote_storage_backup_and_restore( Lsn(detail["last_record_lsn"]) >= current_lsn ), "current db Lsn should should not be less than the one stored on remote storage" + log.info("select some data, this will cause layers to be downloaded") pg = env.postgres.create_start("main") with pg.cursor() as cur: for checkpoint_number in checkpoint_numbers: assert ( - query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};") - == f"{data_secret}|{checkpoint_number}" + query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};") + == f"{data}|{checkpoint_number}" ) + log.info("ensure that we neede to retry downloads due to test_remote_failures=1") + assert env.pageserver.log_contains(layer_download_failed_regex) + # Exercises 
the upload queue retry code paths. # - Use failpoints to cause all storage ops to fail @@ -247,14 +271,15 @@ def test_remote_storage_upload_queue_retries( wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): - metrics = client.get_metrics() - matches = re.search( - f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$', - metrics, - re.MULTILINE, + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_unfinished", + tenant_id, + timeline_id, + file_kind, + op_kind, ) - assert matches - return int(matches[1]) + assert val is not None, "expecting metric to be present" + return int(val) # create some layers & wait for uploads to finish overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a") @@ -334,7 +359,6 @@ def test_remote_storage_upload_queue_retries( def tenant_active(): all_states = client.tenant_list() [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["has_in_progress_downloads"] is False assert tenant["state"] == "Active" wait_until(30, 1, tenant_active) @@ -345,6 +369,168 @@ def test_remote_storage_upload_queue_retries( assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000 +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_remote_timeline_client_calls_started_metric( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_remote_timeline_client_metrics", + ) + + env = neon_env_builder.init_start() + + # create tenant with config that will determinstically allow + # compaction and gc + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + 
"checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # don't create image layers, that causes just noise + "image_creation_threshold": "10000", + } + ) + + client = env.pageserver.http_client() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + + def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): + # create initial set of layers & upload them with failpoints configured + pg.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 10000) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + def get_queued_count(file_kind, op_kind): + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_unfinished", + tenant_id, + timeline_id, + file_kind, + op_kind, + ) + if val is None: + return val + return int(val) + + def wait_upload_queue_empty(): + wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) + wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) + wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + + calls_started: Dict[Tuple[str, str], List[int]] = { + ("layer", "upload"): [0], + ("index", "upload"): [0], + ("layer", "delete"): [0], + } + + def fetch_calls_started(): + for (file_kind, op_kind), observations in calls_started.items(): + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_started_count", + 
tenant_id, + timeline_id, + file_kind, + op_kind, + ) + assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}" + val = int(val) + observations.append(val) + + def ensure_calls_started_grew(): + for (file_kind, op_kind), observations in calls_started.items(): + log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}") + assert all( + x < y for x, y in zip(observations, observations[1:]) + ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" + + def churn(data_pass1, data_pass2): + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_compact(tenant_id, timeline_id) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_compact(tenant_id, timeline_id) + gc_result = client.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + # create some layers & wait for uploads to finish + churn("a", "b") + + wait_upload_queue_empty() + + # ensure that we updated the calls_started metric + fetch_calls_started() + ensure_calls_started_grew() + + # more churn to cause more operations + churn("c", "d") + + # ensure that the calls_started metric continued to be updated + fetch_calls_started() + ensure_calls_started_grew() + + ### now we exercise the download path + calls_started.clear() + calls_started.update( + { + ("index", "download"): [0], + ("layer", "download"): [0], + } + ) + + env.pageserver.stop(immediate=True) + env.postgres.stop_all() + + dir_to_clear = Path(env.repo_dir) / "tenants" + shutil.rmtree(dir_to_clear) + os.mkdir(dir_to_clear) + + env.pageserver.start() + client = env.pageserver.http_client() + + client.tenant_attach(tenant_id) + + def tenant_active(): + all_states = client.tenant_list() + [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] + 
assert tenant["state"] == "Active" + + wait_until(30, 1, tenant_active) + + log.info("restarting postgres to validate") + pg = env.postgres.create_start("main", tenant_id=tenant_id) + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000 + + # ensure that we updated the calls_started download metric + fetch_calls_started() + ensure_calls_started_grew() + + # Test that we correctly handle timeline with layers stuck in upload queue @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_timeline_deletion_with_files_stuck_in_upload_queue( @@ -378,14 +564,14 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( client = env.pageserver.http_client() def get_queued_count(file_kind, op_kind): - metrics = client.get_metrics() - matches = re.search( - f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$', - metrics, - re.MULTILINE, + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_unfinished", + tenant_id, + timeline_id, + file_kind, + op_kind, ) - assert matches - return int(matches[1]) + return int(val) if val is not None else val pg = env.postgres.create_start("main", tenant_id=tenant_id) @@ -436,8 +622,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert not timeline_path.exists() - # timeline deletion should kill ongoing uploads - assert get_queued_count(file_kind="index", op_kind="upload") == 0 + # timeline deletion should kill ongoing uploads, so, the metric will be gone + assert get_queued_count(file_kind="index", op_kind="upload") is None # timeline deletion should be unblocking checkpoint ops checkpoint_thread.join(2.0) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 6d621fbb77..29cdcb18ce 100644 --- a/test_runner/regress/test_tenant_conf.py +++ 
b/test_runner/regress/test_tenant_conf.py @@ -59,7 +59,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 100, "image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, # 7 days }.items() ) @@ -79,7 +79,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 30, "image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, }.items() ) @@ -107,7 +107,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, }.items() ) @@ -130,7 +130,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, - "pitr_interval": 2592000, + "pitr_interval": 604800, }.items() ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 59811c565c..db5bb679f2 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,9 +1,13 @@ +import asyncio +import random import time from threading import Thread +import asyncpg import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, PageserverApiException, PageserverHttpClient, @@ -12,6 +16,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until, wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -24,6 +29,7 @@ def do_gc_target( """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: log.info("sending gc http request") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) pageserver_http.timeline_gc(tenant_id, timeline_id, 0) 
except Exception as e: log.error("do_gc failed: %s", e) @@ -31,6 +37,202 @@ def do_gc_target( log.info("gc http thread returning") +# Basic detach and re-attach test +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_tenant_reattach( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_reattach", + ) + + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # create new nenant + tenant_id, timeline_id = env.neon_cli.create_tenant() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # Wait for the all data to be processed by the pageserver and uploaded in remote storage + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + + pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_attach(tenant_id) + + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + + # Check that we had to 
retry the downloads + assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*download.*failed, will retry.*") + + +num_connections = 10 +num_rows = 100000 +updates_to_perform = 0 + +updates_started = 0 +updates_finished = 0 + + +# Run random UPDATEs on test table. On failure, try again. +async def update_table(pg_conn: asyncpg.Connection): + global updates_started, updates_finished, updates_to_perform + + while updates_started < updates_to_perform or updates_to_perform == 0: + updates_started += 1 + id = random.randrange(1, num_rows) + + # Loop to retry until the UPDATE succeeds + while True: + try: + await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}") + updates_finished += 1 + if updates_finished % 1000 == 0: + log.info(f"update {updates_finished} / {updates_to_perform}") + break + except asyncpg.PostgresError as e: + # Received error from Postgres. Log it, sleep a little, and continue + log.info(f"UPDATE error: {e}") + await asyncio.sleep(0.1) + + +async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId): + global updates_started, updates_finished, updates_to_perform + + # Wait until we have performed some updates + wait_until(20, 0.5, lambda: updates_finished > 500) + + log.info("Detaching tenant") + pageserver_http.tenant_detach(tenant_id) + await asyncio.sleep(1) + log.info("Re-attaching tenant") + pageserver_http.tenant_attach(tenant_id) + log.info("Re-attach finished") + + # Continue with 5000 more updates + updates_to_perform = updates_started + 5000 + + +# async guts of test_tenant_reattach_while_bysy test +async def reattach_while_busy( + env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId +): + workers = [] + for worker_id in range(num_connections): + pg_conn = await pg.connect_async() + workers.append(asyncio.create_task(update_table(pg_conn))) + + 
workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id))) + await asyncio.gather(*workers) + + assert updates_finished == updates_to_perform + + +# Detach and re-attach tenant, while compute is busy running queries. +# +# Some of the queries may fail, in the window that the tenant has been +# detached but not yet re-attached. But Postgres itself should keep +# running, and when we retry the queries, they should start working +# after the attach has finished. + +# FIXME: +# +# This is pretty unstable at the moment. I've seen it fail with a warning like this: +# +# AssertionError: assert not ['2023-01-05T13:09:40.708303Z WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n'] +# +# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/) +# I believe that failure happened because there is a race condition +# between detach and starting remote upload tasks: +# +# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown +# signal to all in-progress tasks associated with the tenant. +# 2. Just after shutdown_tasks() has collected the list of tasks, +# a new remote-upload task is spawned. 
+# +# See https://github.com/neondatabase/neon/issues/3273 +# +# +# I also saw this failure: +# +# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy +# asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id)) +# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run +# return loop.run_until_complete(main) +# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete +# return future.result() +# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy +# assert updates_finished == updates_to_perform +# E assert 5010 == 10010 +# E +5010 +# E -10010 +# +# I don't know what's causing that... +@pytest.mark.skip(reason="fixme") +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_tenant_reattach_while_busy( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_reattach_while_busy", + ) + env = neon_env_builder.init_start() + + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*") + env.pageserver.allowed_errors.append( + ".*Tenant .* will not become active\\. 
Current state: Stopping.*" + ) + + pageserver_http = env.pageserver.http_client() + + # create new nenant + tenant_id, timeline_id = env.neon_cli.create_tenant( + # Create layers aggressively + conf={"checkpoint_distance": "100000"} + ) + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + cur = pg.connect().cursor() + + cur.execute("CREATE TABLE t(id int primary key, counter int)") + cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0") + + # Run the test + asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id)) + + # Verify table contents + assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows + assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform + + def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -417,7 +619,7 @@ def test_ignore_while_attaching( pageserver_http.tenant_attach(tenant_id) # Run ignore on the task, thereby cancelling the attach. # XXX This should take priority over attach, i.e., it should cancel the attach task. - # But neither the failpoint, nor the proper storage_sync2 download functions, + # But neither the failpoint, nor the proper storage_sync download functions, # are sensitive to task_mgr::shutdown. # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 . # So, for now, effectively, this ignore here will block until attach task completes. 
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 081fd0fc2f..1b58937e2a 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -13,12 +13,15 @@ from fixtures.neon_fixtures import ( PageserverHttpClient, PortDistributor, Postgres, - assert_no_in_progress_downloads_for_tenant, + assert_tenant_status, + tenant_exists, wait_for_last_record_lsn, wait_for_upload, + wait_until, + wait_while, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until +from fixtures.utils import query_scalar, start_in_background, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -406,17 +409,13 @@ def test_tenant_relocation( # call to attach timeline to new pageserver new_pageserver_http.tenant_attach(tenant_id) - # check that it shows that download is in progress + # wait for tenant to finish attaching tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) - assert tenant_status.get("has_in_progress_downloads"), tenant_status - - # wait until tenant is downloaded + assert tenant_status["state"] in ["Attaching", "Active"] wait_until( number_of_iterations=10, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant( - new_pageserver_http, tenant_id - ), + func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"), ) check_timeline_attached( @@ -459,9 +458,15 @@ def test_tenant_relocation( # detach tenant from old pageserver before we check # that all the data is there to be sure that old pageserver - # is no longer involved, and if it is, we will see the errors + # is no longer involved, and if it is, we will see the error pageserver_http.tenant_detach(tenant_id) + # Wait a little, so that the detach operation has time to finish. 
+ wait_while( + number_of_iterations=100, + interval=1, + func=lambda: tenant_exists(pageserver_http, tenant_id), + ) post_migration_check(pg_main, 500500, old_local_path_main) post_migration_check(pg_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index ddae1a67ff..4eba4ce942 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -20,44 +20,48 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): matching = [t for t in all_states if TenantId(t["id"]) == tenant] return get_only_element(matching)["state"] - def get_metric_value(name): - metrics = client.get_metrics() - relevant = [line for line in metrics.splitlines() if line.startswith(name)] - if len(relevant) == 0: - return 0 - line = get_only_element(relevant) - value = line.lstrip(name).strip() - return int(value) - def delete_all_timelines(tenant: TenantId): timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) + def assert_active(tenant): + assert get_state(tenant) == "Active" + # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) + assert ( + get_state(tenant) == "Active" + ), "Pageserver should activate a tenant and start background jobs if timelines are loaded" # Stop compute pg.stop() - # Delete all timelines on all tenants + # Delete all timelines on all tenants. + # + # FIXME: we used to check that the background jobs are stopped when all timelines + # are removed, but we don't stop them anymore. Not sure if this test still makes sense + # or we should just remove it. 
for tenant_info in client.tenant_list(): tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active(tenant_id)) # Assert that all tasks finish quickly after tenant is detached - assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + assert task_starts is not None + assert int(task_starts) > 0 client.tenant_detach(tenant) client.tenant_detach(env.initial_tenant) def assert_tasks_finish(): - tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') - tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') - tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}') + tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}') log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended - assert tasks_panicked == 0 + assert tasks_panicked is None or int(tasks_panicked) == 0 wait_until(10, 0.2, assert_tasks_finish) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0b20afefc3..9477ae3c25 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -7,7 +7,11 @@ from typing import List import pytest from fixtures.log_helper import log -from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics +from fixtures.metrics import ( + PAGESERVER_PER_TENANT_METRICS, + PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, + parse_metrics, +) from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): ) -def 
test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "remote_storage_kind", + # exercise both the code paths where remote_storage=None and remote_storage=Some(...) + [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3], +) +def test_pageserver_metrics_removed_after_detach( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_pageserver_metrics_removed_after_detach", + ) + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde for tenant in [tenant_1, tenant_2]: pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) - assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS) + expected = set(PAGESERVER_PER_TENANT_METRICS) + if remote_storage_kind == RemoteStorageKind.NOOP: + # if there's no remote storage configured, we don't expose the remote timeline client metrics + expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS) + assert pre_detach_samples == expected env.pageserver.http_client().tenant_detach(tenant) diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index afc413f3e3..6da6a4d446 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -21,9 +21,10 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, Postgres, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, + assert_tenant_status, available_remote_storages, wait_for_last_record_lsn, + wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, ) from fixtures.types import Lsn, TenantId, 
TimelineId @@ -120,6 +121,11 @@ def test_tenants_attached_after_download( data_id = 1 data_secret = "very secret secret" + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() @@ -158,26 +164,19 @@ def test_tenants_attached_after_download( wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 env.postgres.stop_all() - sk_commit_lsns = [ - sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn - for sk in env.safekeepers - ] - log.info("wait for pageserver to process all the WAL") - wait_for_last_record_lsn(client, tenant_id, timeline_id, max(sk_commit_lsns)) - log.info("wait for it to reach remote storage") - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(client, tenant_id, timeline_id, max(sk_commit_lsns)) - log.info("latest safekeeper_commit_lsn reached remote storage") - - detail_before = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert ( - detail_before["current_physical_size_non_incremental"] - == detail_before["current_physical_size"] + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver ) env.pageserver.stop() @@ -193,13 +192,16 @@ def test_tenants_attached_after_download( assert 
local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}" ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + # FIXME: just starting the pageserver no longer downloads the + # layer files. Do we want to force download, or maybe run some + # queries, or is it enough that it starts up without layer files? env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -211,11 +213,8 @@ def test_tenants_attached_after_download( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" - # Check that the physical size matches after re-downloading - detail_after = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert detail_before["current_physical_size"] == detail_after["current_physical_size"] + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*") @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) @@ -230,7 +229,7 @@ def test_tenant_upgrades_index_json_from_v0( "timeline_layers":[ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9" ], - "missing_layers":[], + "missing_layers":["This should not fail as its not used anymore"], "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[] }""" @@ -262,7 +261,6 @@ def test_tenant_upgrades_index_json_from_v0( wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - env.postgres.stop_all() 
env.pageserver.stop() @@ -275,7 +273,10 @@ def test_tenant_upgrades_index_json_from_v0( # keep the deserialized for later inspection orig_index_part = json.load(timeline_file) - v0_index_part = {key: orig_index_part[key] for key in v0_skeleton} + v0_index_part = { + key: orig_index_part[key] + for key in v0_skeleton.keys() - ["missing_layers"] # pgserver doesn't have it anymore + } timeline_file.seek(0) json.dump(v0_index_part, timeline_file) @@ -287,7 +288,7 @@ def test_tenant_upgrades_index_json_from_v0( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), ) pg = env.postgres.create_start("main") @@ -307,7 +308,7 @@ def test_tenant_upgrades_index_json_from_v0( # make sure the file has been upgraded back to how it started index_part = local_fs_index_part(env, tenant_id, timeline_id) assert index_part["version"] == orig_index_part["version"] - assert index_part["missing_layers"] == orig_index_part["missing_layers"] + assert "missing_layers" not in index_part.keys() # expect one more layer because of the forced checkpoint assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1 @@ -394,7 +395,7 @@ def test_tenant_ignores_backup_file( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), ) pg = env.postgres.create_start("main") @@ -474,14 +475,15 @@ def test_tenant_redownloads_truncated_file_on_startup( index_part = local_fs_index_part(env, tenant_id, timeline_id) assert index_part["layer_metadata"][path.name]["file_size"] == expected_size - ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + ## Start the pageserver. 
It will notice that the file size doesn't match, and + ## rename away the local file. It will be re-downloaded when it's needed. env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -493,6 +495,10 @@ def test_tenant_redownloads_truncated_file_on_startup( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + # Request non-incremental logical size. Calculating it needs the layer file that + # we corrupted, forcing it to be redownloaded. + client.timeline_detail(tenant_id, timeline_id, include_non_incremental_logical_size=True) + assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" # the remote side of local_layer_truncated diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index cef1f365cd..3b41cc5c90 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,22 +1,28 @@ import math +import queue import random import re +import threading import time from contextlib import closing from pathlib import Path import psycopg2.errors import psycopg2.extras +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverApiException, PageserverHttpClient, PgBin, PortDistributor, Postgres, VanillaPostgres, + assert_tenant_status, wait_for_last_flush_lsn, + wait_until, ) from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size @@ -213,6 +219,89 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental 
value" +@pytest.mark.parametrize("deletion_method", ["tenant_detach", "timeline_delete"]) +def test_timeline_initial_logical_size_calculation_cancellation( + neon_env_builder: NeonEnvBuilder, deletion_method: str +): + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + # load in some data + pg = env.postgres.create_start("main", tenant_id=tenant_id) + pg.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g", + ] + ) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pg.stop() + + # restart with failpoint inside initial size calculation task + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + def tenant_active(): + all_states = client.tenant_list() + [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] + assert tenant["state"] == "Active" + + wait_until(30, 1, tenant_active) + + # kick off initial size calculation task (the response we get here is the estimated size) + def assert_size_calculation_not_done(): + details = client.timeline_detail( + tenant_id, timeline_id, include_non_incremental_logical_size=True + ) + assert details["current_logical_size"] != details["current_logical_size_non_incremental"] + + assert_size_calculation_not_done() + # ensure we're really stuck + time.sleep(5) + assert_size_calculation_not_done() + + log.info( + f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" + ) + delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) + + def delete_timeline_thread_fn(): + try: + if deletion_method == "tenant_detach": + client.tenant_detach(tenant_id) + elif deletion_method == "timeline_delete": + client.timeline_delete(tenant_id, timeline_id) + delete_timeline_success.put(True) + except PageserverApiException: + 
delete_timeline_success.put(False) + raise + + delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn) + delete_timeline_thread.start() + # give it some time to settle in the state where it waits for size computation task + time.sleep(5) + if not delete_timeline_success.empty(): + assert ( + False + ), f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" + + log.info( + "resume the size calculation. The failpoint checks that the timeline directory still exists." + ) + client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return")) + client.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + log.info("wait for delete timeline thread to finish and assert that it succeeded") + assert delete_timeline_success.get() + + # if the implementation is incorrect, the teardown would complain about an error log + # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" + + def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env = neon_simple_env new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") @@ -233,7 +322,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env.pageserver.stop() env.pageserver.start() - assert_physical_size(env, env.initial_tenant, new_timeline_id) + # Wait for the tenant to be loaded + client = env.pageserver.http_client() + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"), + ) + + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): @@ -254,7 +353,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) 
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -289,7 +390,9 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): @@ -326,10 +429,11 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) # The timeline logical and physical sizes are also exposed as prometheus metrics. 
@@ -362,7 +466,7 @@ def test_timeline_size_metrics( # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() matches = re.search( - f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', + f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', metrics, re.MULTILINE, ) @@ -421,11 +525,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: TimelineId): - res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) - return res["current_physical_size_non_incremental"] + def get_timeline_resident_physical_size(timeline: TimelineId): + sizes = get_physical_size_values(env, tenant, timeline) + assert_physical_size_invariants(sizes) + return sizes.prometheus_resident_physical - timeline_total_size = get_timeline_physical_size(timeline) + timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline) for i in range(10): n_rows = random.randint(100, 1000) @@ -442,22 +547,54 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, tenant, timeline) pageserver_http.timeline_checkpoint(tenant, timeline) - timeline_total_size += get_timeline_physical_size(timeline) + timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) pg.stop() - tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"]) - assert tenant_physical_size == timeline_total_size + # ensure that tenant_status current_physical size reports sum of timeline current_physical_size + tenant_current_physical_size = int( + client.tenant_status(tenant_id=tenant)["current_physical_size"] + ) + assert tenant_current_physical_size == sum( + [tl["current_physical_size"] for tl in 
client.timeline_list(tenant_id=tenant)] + ) + # since we don't do layer eviction, current_physical_size is identical to resident physical size + assert timeline_total_resident_physical_size == tenant_current_physical_size -def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): - """Check the current physical size returned from timeline API - matches the total physical size of the timeline on disk""" +class TimelinePhysicalSizeValues: + api_current_physical: int + prometheus_resident_physical: int + python_timelinedir_layerfiles_physical: int + + +def get_physical_size_values( + env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId +) -> TimelinePhysicalSizeValues: + res = TimelinePhysicalSizeValues() + client = env.pageserver.http_client() - res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True) + + res.prometheus_resident_physical = client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + detail = client.timeline_detail( + tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True + ) + res.api_current_physical = detail["current_physical_size"] + timeline_path = env.timeline_dir(tenant_id, timeline_id) - assert res["current_physical_size"] == res["current_physical_size_non_incremental"] - assert res["current_physical_size"] == get_timeline_dir_size(timeline_path) + res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path) + + return res + + +def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): + # resident physical size is defined as the size of the layer files in the timeline directory + assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical + # we don't do layer eviction, so, all layers are resident + assert sizes.api_current_physical == sizes.prometheus_resident_physical # Timeline logical size initialization is an asynchronous background task that runs once, diff --git
a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3b72aba422..72d27c3aba 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -585,17 +585,23 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re if elapsed > wait_lsn_timeout: raise RuntimeError("Timed out waiting for WAL redo") - pageserver_lsn = Lsn( - env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"] - ) - lag = last_lsn - pageserver_lsn + tenant_status = ps_cli.tenant_status(tenant_id) + if tenant_status["state"] == "Loading": + log.debug(f"Tenant {tenant_id} is still loading, retrying") + else: + pageserver_lsn = Lsn( + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "last_record_lsn" + ] + ) + lag = last_lsn - pageserver_lsn - if time.time() > last_debug_print + 10 or lag <= 0: - last_debug_print = time.time() - log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb") + if time.time() > last_debug_print + 10 or lag <= 0: + last_debug_print = time.time() + log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb") - if lag <= 0: - break + if lag <= 0: + break time.sleep(1) @@ -883,9 +889,12 @@ class SafekeeperEnv: raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") def get_safekeeper_connstrs(self): + assert self.safekeepers is not None, "safekeepers are not initialized" return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers]) def create_postgres(self): + assert self.tenant_id is not None, "tenant_id is not initialized" + assert self.timeline_id is not None, "tenant_id is not initialized" pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata") pg = ProposerPostgres( pgdata_dir, @@ -1096,7 +1105,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env.pageserver.allowed_errors.extend( [ ".*Failed to process query for timeline 
.*: Timeline .* was not found in global map.*", - ".*end streaming to Some.*", ] ) diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index aaaa8893a5..24045e2eb7 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -65,7 +65,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index de9a26513d..3aff839b81 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -13,9 +13,9 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] -ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } +chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] } clap = { version = "4", features = ["color", "derive", "error-context", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } @@ -36,12 +36,11 @@ prost = { version = "0.11", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", 
"unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +serde_json = { version = "1", features = ["raw_value", "std"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } -stable_deref_trait = { version = "1", features = ["alloc", "std"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } @@ -49,7 +48,6 @@ tracing-core = { version = "0.1", features = ["once_cell", "std"] } url = { version = "2", features = ["serde"] } 
[build-dependencies] -ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } either = { version = "1", features = ["use_std"] }