Merge commit '95bf19b85a06b27a7fc3118dee03d48648efab15' into release-2023-01-10

Conflicts:
        .github/helm-values/neon-stress.proxy-scram.yaml
        .github/helm-values/neon-stress.proxy.yaml
        .github/helm-values/staging.proxy-scram.yaml
        .github/helm-values/staging.proxy.yaml
        All of the above were deleted in `main` after we hotfixed them
        in `release. Deleting them here
        storage_broker/src/bin/storage_broker.rs
        Hotfix toned down logging, but `main` has sinced implemented
        a proper fix. Taken `main`'s side, see
        https://neondb.slack.com/archives/C033RQ5SPDH/p1673354385387479?thread_ts=1673354306.474729&cid=C033RQ5SPDH

closes https://github.com/neondatabase/neon/issues/3287
This commit is contained in:
Christian Schwarz
2023-01-10 15:39:03 +01:00
201 changed files with 9867 additions and 5057 deletions

View File

@@ -0,0 +1,10 @@
## Describe your changes
## Issue ticket number and link
## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.

View File

@@ -14,7 +14,7 @@
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->

View File

@@ -1,32 +0,0 @@
storage:
vars:
bucket_name: neon-storage-ireland
bucket_region: eu-west-1
console_mgmt_base_url: http://neon-stress-console.local
broker_endpoint: http://storage-broker.neon-stress.local:50051
safekeeper_enable_s3_offload: 'false'
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: neon-stress/wal
hostname_suffix: ".local"
remote_user: admin
sentry_environment: development
children:
pageservers:
hosts:
neon-stress-ps-1:
console_region_id: aws-eu-west-1
neon-stress-ps-2:
console_region_id: aws-eu-west-1
safekeepers:
hosts:
neon-stress-sk-1:
console_region_id: aws-eu-west-1
neon-stress-sk-2:
console_region_id: aws-eu-west-1
neon-stress-sk-3:
console_region_id: aws-eu-west-1

View File

@@ -6,6 +6,8 @@ storage:
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"

View File

@@ -1,35 +0,0 @@
storage:
vars:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
broker_endpoint: http://storage-broker.staging.local:50051
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: us-stage/wal
hostname_suffix: ".local"
remote_user: admin
sentry_environment: development
children:
pageservers:
hosts:
zenith-us-stage-ps-2:
console_region_id: aws-us-east-1
zenith-us-stage-ps-3:
console_region_id: aws-us-east-1
zenith-us-stage-ps-4:
console_region_id: aws-us-east-1
safekeepers:
hosts:
zenith-us-stage-sk-4:
console_region_id: aws-us-east-1
zenith-us-stage-sk-5:
console_region_id: aws-us-east-1
zenith-us-stage-sk-6:
console_region_id: aws-us-east-1

View File

@@ -6,6 +6,8 @@ storage:
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
@@ -25,6 +27,8 @@ storage:
ansible_host: i-0c3e70929edb5d691
pageserver-1.us-east-2.aws.neon.build:
ansible_host: i-0565a8b4008aa3f40
pageserver-2.us-east-2.aws.neon.build:
ansible_host: i-01e31cdf7e970586a
safekeepers:
hosts:

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.eu-west-1.aws.neon.build"
sentryEnvironment: "development"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.cloud.stage.neon.tech"
sentryEnvironment: "development"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
sentryEnvironment: "development"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -1,56 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: neon-stress
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "development"

View File

@@ -1,52 +0,0 @@
fullnameOverride: "neon-stress-proxy-scram"
settings:
authBackend: "console"
authEndpoint: "http://neon-stress-console.local/management/api/v2"
domain: "*.stress.neon.tech"
sentryEnvironment: "development"
podLabels:
zenith_service: proxy-scram
zenith_env: staging
zenith_region: eu-west-1
zenith_region_slug: ireland
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,61 +0,0 @@
fullnameOverride: "neon-stress-proxy"
settings:
authBackend: "link"
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
uri: "https://console.dev.neon.tech/psql_session/"
sentryEnvironment: "development"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: staging
zenith_region: eu-west-1
zenith_region_slug: ireland
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
type: LoadBalancer
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-west-2.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -3,6 +3,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.cloud.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
podLabels:
zenith_service: proxy-scram
@@ -16,6 +17,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
httpsPort: 443
metrics:
enabled: true

View File

@@ -1,56 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: staging
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "development"

View File

@@ -1,57 +0,0 @@
# Helm chart values for zenith-proxy.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.cloud.stage.neon.tech"
sentryEnvironment: "development"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: staging
zenith_region: us-east-1
zenith_region_slug: virginia
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,57 +0,0 @@
# Helm chart values for zenith-proxy.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "link"
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
uri: "https://console.stage.neon.tech/psql_session/"
sentryEnvironment: "development"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: staging
zenith_region: us-east-1
zenith_region_slug: virginia
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -18,6 +18,7 @@ on:
region_id:
description: 'Use a particular region. If not set the default region will be used'
required: false
default: 'aws-us-east-2'
save_perf_report:
type: boolean
description: 'Publish perf report or not. If not set, the report is published only for the main branch'
@@ -115,13 +116,10 @@ jobs:
# neon-captest-prefetch: Same, with prefetching enabled (new project)
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
db_size: [ 10gb ]
runner: [ us-east-2 ]
include:
- platform: neon-captest-new
db_size: 50gb
runner: us-east-2
- platform: neon-captest-prefetch
db_size: 50gb
runner: us-east-2
@@ -409,7 +407,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h

View File

@@ -111,6 +111,7 @@ jobs:
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
- name: Run cargo clippy
run: ./run_clippy.sh
@@ -126,6 +127,11 @@ jobs:
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check
build-neon:
runs-on: [ self-hosted, dev, x64 ]
container:
@@ -177,13 +183,12 @@ jobs:
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES="--features testing"
CARGO_FLAGS="--locked $CARGO_FEATURES"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features testing,profiling"
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
@@ -555,10 +560,14 @@ jobs:
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
compute-node-image-v14:
compute-node-image:
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
defaults:
run:
shell: sh -eu {0}
@@ -573,32 +582,40 @@ jobs:
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
- name: Kaniko build compute node with extensions
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
compute-node-image-v15:
vm-compute-node-image:
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
needs: [ tag, compute-node-image ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
- name: Downloading latest vm-builder
run: |
curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
chmod +x vm-builder
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Pulling compute-node image
run: |
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
- name: Build vm image
run: |
./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]
steps:
@@ -642,13 +659,13 @@ jobs:
promote-images:
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag, test-images ]
needs: [ tag, test-images, vm-compute-node-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools]
steps:
- name: Promote image to latest
@@ -681,9 +698,15 @@ jobs:
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
- name: Pull vm compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
- name: Pull compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
- name: Pull vm compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
@@ -695,7 +718,9 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
- name: Configure Docker Hub login
run: |
@@ -712,9 +737,15 @@ jobs:
- name: Push compute node v14 image to Docker Hub
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
- name: Push vm compute node v14 image to Docker Hub
run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
- name: Push compute node v15 image to Docker Hub
run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
- name: Push vm compute node v15 image to Docker Hub
run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
- name: Push rust image to Docker Hub
run: crane push rust neondatabase/rust:pinned
@@ -726,26 +757,25 @@ jobs:
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
calculate-deploy-targets:
runs-on: [ self-hosted, dev, x64 ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.ref_name == 'release' &&
github.event_name != 'workflow_dispatch'
outputs:
matrix-include: ${{ steps.set-matrix.outputs.include }}
steps:
- id: set-matrix
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
echo "include=[$STAGING]" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
if [[ "$GITHUB_REF_NAME" == "release" ]]; then
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'"
exit 1
fi
@@ -756,7 +786,7 @@ jobs:
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.ref_name == 'release' &&
github.event_name != 'workflow_dispatch'
defaults:
run:
@@ -764,6 +794,8 @@ jobs:
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
environment:
name: prod-old
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -800,7 +832,7 @@ jobs:
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
@@ -809,7 +841,9 @@ jobs:
shell: bash
strategy:
matrix:
target_region: [ us-east-2 ]
target_region: [ eu-west-1, us-east-2 ]
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -881,6 +915,8 @@ jobs:
strategy:
matrix:
target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -912,7 +948,7 @@ jobs:
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.ref_name == 'release' &&
github.event_name != 'workflow_dispatch'
defaults:
run:
@@ -920,6 +956,8 @@ jobs:
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
environment:
name: prod-old
env:
KUBECONFIG: .kubeconfig
steps:
@@ -945,8 +983,8 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker:
name: deploy storage broker on old staging and old prod
@@ -955,7 +993,7 @@ jobs:
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.ref_name == 'release' &&
github.event_name != 'workflow_dispatch'
defaults:
run:
@@ -963,6 +1001,8 @@ jobs:
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
environment:
name: prod-old
env:
KUBECONFIG: .kubeconfig
steps:
@@ -1011,6 +1051,8 @@ jobs:
target_cluster: dev-eu-west-1-zeta
deploy_link_proxy: false
deploy_legacy_scram_proxy: false
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -1026,19 +1068,19 @@ jobs:
- name: Re-deploy scram proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy link proxy
if: matrix.deploy_link_proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy legacy scram proxy
if: matrix.deploy_legacy_scram_proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-dev-new:
runs-on: [ self-hosted, dev, x64 ]
@@ -1058,6 +1100,8 @@ jobs:
target_cluster: dev-us-east-2-beta
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -1096,6 +1140,8 @@ jobs:
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -1111,7 +1157,7 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-prod-new:
runs-on: prod
@@ -1135,6 +1181,8 @@ jobs:
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3

487
Cargo.lock generated
View File

@@ -66,12 +66,6 @@ dependencies = [
"backtrace",
]
[[package]]
name = "arrayvec"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
[[package]]
name = "asn1-rs"
version = "0.5.1"
@@ -563,6 +557,12 @@ version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]]
name = "base64"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
[[package]]
name = "bincode"
version = "1.3.3"
@@ -627,12 +627,6 @@ version = "3.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
[[package]]
name = "bytemuck"
version = "1.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f"
[[package]]
name = "byteorder"
version = "1.4.3"
@@ -893,7 +887,7 @@ dependencies = [
"clap 4.0.29",
"comfy-table",
"git-version",
"nix 0.25.1",
"nix",
"once_cell",
"pageserver_api",
"postgres",
@@ -928,15 +922,6 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
[[package]]
name = "cpp_demangle"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f"
dependencies = [
"cfg-if",
]
[[package]]
name = "cpufeatures"
version = "0.2.5"
@@ -1060,7 +1045,7 @@ dependencies = [
"crossterm_winapi",
"libc",
"mio",
"parking_lot 0.12.1",
"parking_lot",
"signal-hook",
"signal-hook-mio",
"winapi",
@@ -1170,15 +1155,6 @@ version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb"
[[package]]
name = "debugid"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730"
dependencies = [
"uuid 0.8.2",
]
[[package]]
name = "debugid"
version = "0.8.0"
@@ -1186,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
dependencies = [
"serde",
"uuid 1.2.2",
"uuid",
]
[[package]]
@@ -1312,18 +1288,6 @@ dependencies = [
"windows-sys 0.42.0",
]
[[package]]
name = "findshlibs"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
dependencies = [
"cc",
"lazy_static",
"libc",
"winapi",
]
[[package]]
name = "fixedbitset"
version = "0.4.2"
@@ -1336,21 +1300,6 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.1.0"
@@ -1752,16 +1701,16 @@ dependencies = [
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
name = "hyper-tungstenite"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
checksum = "d62004bcd4f6f85d9e2aa4206f1466ee67031f5ededcb6c6e62d48f9306ad879"
dependencies = [
"bytes",
"hyper",
"native-tls",
"pin-project",
"tokio",
"tokio-native-tls",
"tokio-tungstenite",
"tungstenite",
]
[[package]]
@@ -1815,24 +1764,6 @@ dependencies = [
"serde",
]
[[package]]
name = "inferno"
version = "0.10.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f"
dependencies = [
"ahash",
"atty",
"indexmap",
"itoa",
"lazy_static",
"log",
"num-format",
"quick-xml",
"rgb",
"str_stack",
]
[[package]]
name = "inotify"
version = "0.9.6"
@@ -1920,7 +1851,7 @@ version = "8.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828"
dependencies = [
"base64",
"base64 0.13.1",
"pem",
"ring",
"serde",
@@ -2059,15 +1990,6 @@ version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memmap2"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc"
dependencies = [
"libc",
]
[[package]]
name = "memoffset"
version = "0.6.5"
@@ -2135,37 +2057,6 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "native-tls"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
dependencies = [
"lazy_static",
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "nix"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c"
dependencies = [
"bitflags",
"cc",
"cfg-if",
"libc",
"memoffset 0.6.5",
]
[[package]]
name = "nix"
version = "0.25.1"
@@ -2229,16 +2120,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-format"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
dependencies = [
"arrayvec",
"itoa",
]
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -2299,51 +2180,12 @@ version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "openssl"
version = "0.10.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29d971fd5722fec23977260f6e81aa67d2f22cadbdc2aa049f1022d9a3be1566"
dependencies = [
"bitflags",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5454462c0eced1e97f2ec09036abc8da362e66802f66fd20f86854d9d8cbcbc4"
dependencies = [
"autocfg",
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "os_info"
version = "3.5.1"
@@ -2394,7 +2236,7 @@ dependencies = [
"hyper",
"itertools",
"metrics",
"nix 0.25.1",
"nix",
"num-traits",
"once_cell",
"pageserver_api",
@@ -2404,11 +2246,11 @@ dependencies = [
"postgres-types",
"postgres_connection",
"postgres_ffi",
"pprof",
"pq_proto",
"rand",
"regex",
"remote_storage",
"reqwest",
"rstar",
"scopeguard",
"serde",
@@ -2417,12 +2259,12 @@ dependencies = [
"signal-hook",
"storage_broker",
"svg_fmt",
"tar",
"tempfile",
"tenant_size_model",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-tar",
"tokio-util",
"toml_edit",
"tracing",
@@ -2447,17 +2289,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "parking_lot"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
dependencies = [
"instant",
"lock_api",
"parking_lot_core 0.8.5",
]
[[package]]
name = "parking_lot"
version = "0.12.1"
@@ -2465,21 +2296,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core 0.9.5",
]
[[package]]
name = "parking_lot_core"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
dependencies = [
"cfg-if",
"instant",
"libc",
"redox_syscall",
"smallvec",
"winapi",
"parking_lot_core",
]
[[package]]
@@ -2507,7 +2324,7 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4"
dependencies = [
"base64",
"base64 0.13.1",
]
[[package]]
@@ -2528,18 +2345,18 @@ dependencies = [
[[package]]
name = "phf"
version = "0.10.1"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
dependencies = [
"siphasher",
]
@@ -2576,12 +2393,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkg-config"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
[[package]]
name = "plotters"
version = "0.3.4"
@@ -2612,12 +2423,12 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
dependencies = [
"bytes",
"fallible-iterator",
"futures",
"futures-util",
"log",
"tokio",
"tokio-postgres",
@@ -2626,9 +2437,9 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
dependencies = [
"base64",
"base64 0.20.0",
"byteorder",
"bytes",
"fallible-iterator",
@@ -2643,8 +2454,8 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.3"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
dependencies = [
"bytes",
"fallible-iterator",
@@ -2688,25 +2499,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pprof"
version = "0.6.1"
source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9"
dependencies = [
"backtrace",
"cfg-if",
"findshlibs",
"inferno",
"lazy_static",
"libc",
"log",
"nix 0.23.2",
"parking_lot 0.11.2",
"symbolic-demangle",
"tempfile",
"thiserror",
]
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -2723,6 +2515,7 @@ dependencies = [
"postgres-protocol",
"rand",
"serde",
"thiserror",
"tokio",
"tracing",
"workspace_hack",
@@ -2801,7 +2594,7 @@ dependencies = [
"lazy_static",
"libc",
"memchr",
"parking_lot 0.12.1",
"parking_lot",
"procfs",
"thiserror",
]
@@ -2868,7 +2661,7 @@ dependencies = [
"anyhow",
"async-trait",
"atty",
"base64",
"base64 0.13.1",
"bstr",
"bytes",
"clap 4.0.29",
@@ -2878,15 +2671,17 @@ dependencies = [
"hex",
"hmac",
"hyper",
"hyper-tungstenite",
"itertools",
"md5",
"metrics",
"once_cell",
"parking_lot 0.12.1",
"parking_lot",
"pin-project-lite",
"pq_proto",
"rand",
"rcgen",
"regex",
"reqwest",
"routerify",
"rstest",
@@ -2898,6 +2693,7 @@ dependencies = [
"sha2",
"socket2",
"thiserror",
"tls-listener",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -2906,20 +2702,12 @@ dependencies = [
"tracing-subscriber",
"url",
"utils",
"uuid 1.2.2",
"uuid",
"webpki-roots",
"workspace_hack",
"x509-parser",
]
[[package]]
name = "quick-xml"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b"
dependencies = [
"memchr",
]
[[package]]
name = "quote"
version = "1.0.21"
@@ -3078,7 +2866,7 @@ version = "0.11.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c"
dependencies = [
"base64",
"base64 0.13.1",
"bytes",
"encoding_rs",
"futures-core",
@@ -3088,12 +2876,10 @@ dependencies = [
"http-body",
"hyper",
"hyper-rustls",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
@@ -3103,7 +2889,6 @@ dependencies = [
"serde_json",
"serde_urlencoded",
"tokio",
"tokio-native-tls",
"tokio-rustls",
"tower-service",
"url",
@@ -3114,15 +2899,6 @@ dependencies = [
"winreg",
]
[[package]]
name = "rgb"
version = "0.8.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3"
dependencies = [
"bytemuck",
]
[[package]]
name = "ring"
version = "0.16.20"
@@ -3261,7 +3037,7 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55"
dependencies = [
"base64",
"base64 0.13.1",
]
[[package]]
@@ -3303,9 +3079,9 @@ dependencies = [
"humantime",
"hyper",
"metrics",
"nix 0.25.1",
"nix",
"once_cell",
"parking_lot 0.12.1",
"parking_lot",
"postgres",
"postgres-protocol",
"postgres_ffi",
@@ -3417,14 +3193,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc"
dependencies = [
"httpdate",
"native-tls",
"reqwest",
"rustls",
"sentry-backtrace",
"sentry-contexts",
"sentry-core",
"sentry-panic",
"tokio",
"ureq",
"webpki-roots",
]
[[package]]
@@ -3482,7 +3259,7 @@ version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6"
dependencies = [
"debugid 0.8.0",
"debugid",
"getrandom",
"hex",
"serde",
@@ -3490,7 +3267,7 @@ dependencies = [
"thiserror",
"time",
"url",
"uuid 1.2.2",
"uuid",
]
[[package]]
@@ -3542,7 +3319,7 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef"
dependencies = [
"base64",
"base64 0.13.1",
"chrono",
"hex",
"indexmap",
@@ -3564,6 +3341,17 @@ dependencies = [
"syn",
]
[[package]]
name = "sha-1"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "sha1"
version = "0.10.5"
@@ -3712,7 +3500,7 @@ dependencies = [
"hyper",
"metrics",
"once_cell",
"parking_lot 0.12.1",
"parking_lot",
"prost",
"tokio",
"tokio-stream",
@@ -3723,12 +3511,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "str_stack"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
[[package]]
name = "stringprep"
version = "0.1.2"
@@ -3776,29 +3558,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
[[package]]
name = "symbolic-common"
version = "8.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540"
dependencies = [
"debugid 0.7.3",
"memmap2",
"stable_deref_trait",
"uuid 0.8.2",
]
[[package]]
name = "symbolic-demangle"
version = "8.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750"
dependencies = [
"cpp_demangle",
"rustc-demangle",
"symbolic-common",
]
[[package]]
name = "syn"
version = "1.0.105"
@@ -3957,10 +3716,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.21.1"
name = "tls-listener"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95"
checksum = "c9d4ff21187d434ac7709bfc7441ca88f63681247e5ad99f0f08c8c91ddc103d"
dependencies = [
"futures-util",
"hyper",
"pin-project-lite",
"thiserror",
"tokio",
"tokio-rustls",
]
[[package]]
name = "tokio"
version = "1.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
dependencies = [
"autocfg",
"bytes",
@@ -3968,12 +3741,11 @@ dependencies = [
"memchr",
"mio",
"num_cpus",
"once_cell",
"pin-project-lite",
"signal-hook-registry",
"socket2",
"tokio-macros",
"winapi",
"windows-sys 0.42.0",
]
[[package]]
@@ -3997,28 +3769,19 @@ dependencies = [
"syn",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.6"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
dependencies = [
"async-trait",
"byteorder",
"bytes",
"fallible-iterator",
"futures",
"futures-channel",
"futures-util",
"log",
"parking_lot 0.12.1",
"parking_lot",
"percent-encoding",
"phf",
"pin-project-lite",
@@ -4065,6 +3828,32 @@ dependencies = [
"tokio",
]
[[package]]
name = "tokio-tar"
version = "0.3.0"
source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
dependencies = [
"filetime",
"futures-core",
"libc",
"redox_syscall",
"tokio",
"tokio-stream",
"xattr",
]
[[package]]
name = "tokio-tungstenite"
version = "0.17.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f714dd15bead90401d77e04243611caec13726c2408afd5b31901dfcdcb3b181"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
]
[[package]]
name = "tokio-util"
version = "0.7.4"
@@ -4109,7 +3898,7 @@ dependencies = [
"async-stream",
"async-trait",
"axum",
"base64",
"base64 0.13.1",
"bytes",
"futures-core",
"futures-util",
@@ -4291,6 +4080,25 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
[[package]]
name = "tungstenite"
version = "0.17.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e27992fd6a8c29ee7eef28fc78349aa244134e10ad447ce3b9f0ac0ed0fa4ce0"
dependencies = [
"base64 0.13.1",
"byteorder",
"bytes",
"http",
"httparse",
"log",
"rand",
"sha-1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -4351,12 +4159,14 @@ version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f"
dependencies = [
"base64",
"base64 0.13.1",
"chunked_transfer",
"log",
"native-tls",
"once_cell",
"rustls",
"url",
"webpki",
"webpki-roots",
]
[[package]]
@@ -4377,6 +4187,12 @@ version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9"
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utils"
version = "0.1.0"
@@ -4393,7 +4209,7 @@ dependencies = [
"hyper",
"jsonwebtoken",
"metrics",
"nix 0.25.1",
"nix",
"once_cell",
"pq_proto",
"rand",
@@ -4417,12 +4233,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "uuid"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
[[package]]
name = "uuid"
version = "1.2.2"
@@ -4439,12 +4249,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
@@ -4743,9 +4547,9 @@ dependencies = [
name = "workspace_hack"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"bytes",
"chrono",
"clap 4.0.29",
"crossbeam-utils",
"either",
@@ -4766,11 +4570,10 @@ dependencies = [
"rand",
"regex",
"regex-syntax",
"reqwest",
"scopeguard",
"serde",
"serde_json",
"socket2",
"stable_deref_trait",
"syn",
"tokio",
"tokio-util",
@@ -4787,7 +4590,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8"
dependencies = [
"asn1-rs",
"base64",
"base64 0.13.1",
"data-encoding",
"der-parser",
"lazy_static",

View File

@@ -86,4 +86,4 @@ lto = true
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
[patch.crates-io]
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }

View File

@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include
# Remove now-useless PGXS src infrastructure
RUN rm -r /usr/local/pgsql/lib/pgxs/src
# Remove static postgresql libraries - all compilation is finished, so we
# can now remove these files - they must be included in other binaries by now
# if they were to be used by other libraries.
@@ -207,7 +204,8 @@ RUN apt update && \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libprotobuf-c1 && \
libprotobuf-c1 \
gdb && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
USER postgres

View File

@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include
# Remove now-useless PGXS src infrastructure
RUN rm -r /usr/local/pgsql/lib/pgxs/src
# Remove static postgresql libraries - all compilation is finished, so we
# can now remove these files - they must be included in other binaries by now
# if they were to be used by other libraries.
@@ -207,7 +204,8 @@ RUN apt update && \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libprotobuf-c1 && \
libprotobuf-c1 \
gdb && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
USER postgres

199
Makefile
View File

@@ -61,146 +61,115 @@ all: neon postgres neon-pg-ext
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: neon
neon: postgres-v14-headers postgres-v15-headers
neon: postgres-headers
+@echo "Compiling Neon"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
### PostgreSQL parts
# The rules are duplicated for Postgres v14 and 15. We may want to refactor
# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
# to avoid the duplication in the future, but it's tolerable for now.
#
$(POSTGRES_INSTALL_DIR)/build/v14/config.status:
+@echo "Configuring Postgres v14 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
+@echo "Configuring Postgres $* build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
$(POSTGRES_INSTALL_DIR)/build/v15/config.status:
+@echo "Configuring Postgres v15 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
# nicer alias to run 'configure'
.PHONY: postgres-v14-configure
postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
.PHONY: postgres-v15-configure
postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
# Note: I've been unable to use templates for this part of our configuration.
# I'm not sure why it wouldn't work, but this is the only place (apart from
# the "build-all-versions" entry points) where direct mention of PostgreSQL
# versions is used.
.PHONY: postgres-configure-v15
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
.PHONY: postgres-configure-v14
postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
.PHONY: postgres-v14-headers
postgres-v14-headers: postgres-v14-configure
+@echo "Installing PostgreSQL v14 headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install
.PHONY: postgres-v15-headers
postgres-v15-headers: postgres-v15-configure
+@echo "Installing PostgreSQL v15 headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install
.PHONY: postgres-headers-%
postgres-headers-%: postgres-configure-%
+@echo "Installing PostgreSQL $* headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL
.PHONY: postgres-v14
postgres-v14: postgres-v14-configure \
postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers`
+@echo "Compiling PostgreSQL v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
+@echo "Compiling libpq v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
+@echo "Compiling pageinspect v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install
.PHONY: postgres-%
postgres-%: postgres-configure-% \
postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
+@echo "Compiling PostgreSQL $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install
+@echo "Compiling libpq $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install
+@echo "Compiling pg_prewarm $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
.PHONY: postgres-v15
postgres-v15: postgres-v15-configure \
postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers`
+@echo "Compiling PostgreSQL v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
+@echo "Compiling libpq v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
+@echo "Compiling pageinspect v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install
.PHONY: postgres-clean-%
postgres-clean-%:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
# shorthand to build all Postgres versions
postgres: postgres-v14 postgres-v15
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling neon $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+@echo "Compiling neon_walredo $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+@echo "Compiling neon_test_utils $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
.PHONY: postgres-v14-clean
postgres-v14-clean:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
.PHONY: postgres-v15-clean
postgres-v15-clean:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean
neon-pg-ext-v14: postgres-v14
+@echo "Compiling neon v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v14
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
neon-pg-ext-v15: postgres-v15
+@echo "Compiling neon v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v15
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
.PHONY: neon-pg-ext
neon-pg-ext: \
neon-pg-ext-v14 \
neon-pg-ext-v15
.PHONY: neon-pg-ext-clean
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15
neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15
postgres-headers: postgres-v14-headers postgres-v15-headers
postgres-clean: postgres-v14-clean postgres-v15-clean
# shorthand to build all Postgres versions
.PHONY: postgres
postgres: \
postgres-v14 \
postgres-v15
.PHONY: postgres-headers
postgres-headers: \
postgres-headers-v14 \
postgres-headers-v15
.PHONY: postgres-clean
postgres-clean: \
postgres-clean-v14 \
postgres-clean-v15
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean
cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean
clean: postgres-clean neon-pg-ext-clean
$(CARGO_CMD_PREFIX) cargo clean
cd pgxn/neon && $(MAKE) clean
cd pgxn/neon_test_utils && $(MAKE) clean
# This removes everything
.PHONY: distclean

View File

@@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -117,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
# Later that would be responsibility of a package install script
> ./target/debug/neon_local init
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2545906
Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
Stopped pageserver 1 process with pid 2545906
# start pageserver and safekeeper
# start pageserver, safekeeper, and broker for their intercommunication
> ./target/debug/neon_local start
Starting neon broker at 127.0.0.1:50051
storage_broker started, pid: 2918372
@@ -130,6 +128,12 @@ pageserver started, pid: 2918386
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2918437
# create initial tenant and use it as a default for every future neon_local invocation
> ./target/debug/neon_local tenant create --set-default
tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
# start postgres compute node
> ./target/debug/neon_local pg start main
Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...

View File

@@ -2,6 +2,7 @@
name = "compute_tools"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = "1.0"
@@ -12,12 +13,12 @@ futures = "0.3.13"
hyper = { version = "0.14", features = ["full"] }
log = { version = "0.4", features = ["std", "serde"] }
notify = "5.0.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
regex = "1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
tar = "0.4"
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
url = "2.2.2"
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -105,7 +105,7 @@ fn main() -> Result<()> {
tenant,
timeline,
pageserver_connstr,
metrics: ComputeMetrics::new(),
metrics: ComputeMetrics::default(),
state: RwLock::new(ComputeState::new()),
};
let compute = Arc::new(compute_state);

View File

@@ -5,7 +5,7 @@ use tokio_postgres::NoTls;
use crate::compute::ComputeNode;
pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
let query = "
CREATE TABLE IF NOT EXISTS health_check (
id serial primary key,

View File

@@ -23,11 +23,11 @@ use std::sync::RwLock;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use log::info;
use log::{info, warn};
use postgres::{Client, NoTls};
use serde::{Serialize, Serializer};
use crate::checker::create_writablity_check_data;
use crate::checker::create_writability_check_data;
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
@@ -91,7 +91,7 @@ pub enum ComputeStatus {
Failed,
}
#[derive(Serialize)]
#[derive(Default, Serialize)]
pub struct ComputeMetrics {
pub sync_safekeepers_ms: AtomicU64,
pub basebackup_ms: AtomicU64,
@@ -99,23 +99,6 @@ pub struct ComputeMetrics {
pub total_startup_ms: AtomicU64,
}
impl ComputeMetrics {
pub fn new() -> Self {
Self {
sync_safekeepers_ms: AtomicU64::new(0),
basebackup_ms: AtomicU64::new(0),
config_ms: AtomicU64::new(0),
total_startup_ms: AtomicU64::new(0),
}
}
}
impl Default for ComputeMetrics {
fn default() -> Self {
Self::new()
}
}
impl ComputeNode {
pub fn set_status(&self, status: ComputeStatus) {
self.state.write().unwrap().status = status;
@@ -175,7 +158,7 @@ impl ComputeNode {
let start_time = Utc::now();
let sync_handle = Command::new(&self.pgbin)
.args(&["--sync-safekeepers"])
.args(["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.stdout(Stdio::piped())
.spawn()
@@ -253,7 +236,7 @@ impl ComputeNode {
// Run postgres as a child process.
let mut pg = Command::new(&self.pgbin)
.args(&["-D", &self.pgdata])
.args(["-D", &self.pgdata])
.spawn()
.expect("cannot start postgres process");
@@ -292,7 +275,7 @@ impl ComputeNode {
handle_databases(&self.spec, &mut client)?;
handle_role_deletions(self, &mut client)?;
handle_grants(self, &mut client)?;
create_writablity_check_data(&mut client)?;
create_writability_check_data(&mut client)?;
// 'Close' connection
drop(client);
@@ -328,6 +311,9 @@ impl ComputeNode {
.wait()
.expect("failed to start waiting on Postgres process");
self.check_for_core_dumps()
.expect("failed to check for core dumps");
Ok(ecode)
}
@@ -343,4 +329,68 @@ impl ComputeNode {
self.prepare_pgdata()?;
self.run()
}
// Look for core dumps and collect backtraces.
//
// EKS worker nodes have following core dump settings:
// /proc/sys/kernel/core_pattern -> core
// /proc/sys/kernel/core_uses_pid -> 1
// ulimint -c -> unlimited
// which results in core dumps being written to postgres data directory as core.<pid>.
//
// Use that as a default location and pattern, except macos where core dumps are written
// to /cores/ directory by default.
fn check_for_core_dumps(&self) -> Result<()> {
let core_dump_dir = match std::env::consts::OS {
"macos" => Path::new("/cores/"),
_ => Path::new(&self.pgdata),
};
// Collect core dump paths if any
info!("checking for core dumps in {}", core_dump_dir.display());
let files = fs::read_dir(core_dump_dir)?;
let cores = files.filter_map(|entry| {
let entry = entry.ok()?;
let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
Some(entry.path())
});
// Print backtrace for each core dump
for core_path in cores {
warn!(
"core dump found: {}, collecting backtrace",
core_path.display()
);
// Try first with gdb
let backtrace = Command::new("gdb")
.args(["--batch", "-q", "-ex", "bt", &self.pgbin])
.arg(&core_path)
.output();
// Try lldb if no gdb is found -- that is handy for local testing on macOS
let backtrace = match backtrace {
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {
warn!("cannot find gdb, trying lldb");
Command::new("lldb")
.arg("-c")
.arg(&core_path)
.args(["--batch", "-o", "bt all", "-o", "quit"])
.output()
}
_ => backtrace,
}?;
warn!(
"core dump backtrace: {}",
String::from_utf8_lossy(&backtrace.stdout)
);
warn!(
"debugger stderr: {}",
String::from_utf8_lossy(&backtrace.stderr)
);
}
Ok(())
}
}

View File

@@ -9,29 +9,11 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
use log::{error, info};
use serde_json;
use crate::compute::{ComputeNode, ComputeStatus};
use crate::compute::ComputeNode;
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
match (req.method(), req.uri().path()) {
// Timestamp of the last Postgres activity in the plain text.
// DEPRECATED in favour of /status
(&Method::GET, "/last_activity") => {
info!("serving /last_active GET request");
let state = compute.state.read().unwrap();
// Use RFC3339 format for consistency.
Response::new(Body::from(state.last_active.to_rfc3339()))
}
// Has compute setup process finished? -> true/false.
// DEPRECATED in favour of /status
(&Method::GET, "/ready") => {
info!("serving /ready GET request");
let status = compute.get_status();
Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
}
// Serialized compute state.
(&Method::GET, "/status") => {
info!("serving /status GET request");
@@ -46,16 +28,6 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
}
// DEPRECATED, use POST instead
(&Method::GET, "/check_writability") => {
info!("serving /check_writability GET request");
let res = crate::checker::check_writability(&compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => Response::new(Body::from(e.to_string())),
}
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let res = crate::checker::check_writability(&compute).await;

View File

@@ -37,58 +37,7 @@ paths:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/ready:
get:
deprecated: true
tags:
- "info"
summary: Check whether compute startup process finished successfully
description: ""
operationId: computeIsReady
responses:
"200":
description: Compute is ready ('true') or not ('false')
content:
text/plain:
schema:
type: string
example: "true"
/last_activity:
get:
deprecated: true
tags:
- "info"
summary: Get timestamp of the last compute activity
description: ""
operationId: getLastComputeActivityTS
responses:
"200":
description: Timestamp of the last compute activity
content:
text/plain:
schema:
type: string
example: "2022-10-12T07:20:50.52Z"
/check_writability:
get:
deprecated: true
tags:
- "check"
summary: Check that we can write new data on this compute
description: ""
operationId: checkComputeWritabilityDeprecated
responses:
"200":
description: Check result
content:
text/plain:
schema:
type: string
description: Error text or 'true' if check passed
example: "true"
post:
tags:
- "check"

View File

@@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
for b in backs.into_iter() {
let state: String = b.get("state");
let change: String = b.get("state_change");
let state: String = match b.try_get("state") {
Ok(state) => state,
Err(_) => continue,
};
if state == "idle" {
let change: String = match b.try_get("state_change") {
Ok(state_change) => state_change,
Err(_) => continue,
};
let change = DateTime::parse_from_rfc3339(&change);
match change {
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
@@ -74,10 +80,8 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
}
// Sort idle backend `state_change` timestamps. The last one corresponds
// to the last activity.
idle_backs.sort();
if let Some(last) = idle_backs.last() {
// Get idle backend `state_change` with the max timestamp.
if let Some(last) = idle_backs.iter().max() {
last_active = *last;
}
}

View File

@@ -119,16 +119,9 @@ pub trait GenericOptionsSearch {
impl GenericOptionsSearch for GenericOptions {
/// Lookup option by name
fn find(&self, name: &str) -> Option<String> {
match &self {
Some(ops) => {
let op = ops.iter().find(|s| s.name == name);
match op {
Some(op) => op.value.clone(),
None => None,
}
}
None => None,
}
let ops = self.as_ref()?;
let op = ops.iter().find(|s| s.name == name)?;
op.value.clone()
}
}
@@ -161,6 +154,14 @@ impl Role {
}
impl Database {
pub fn new(name: PgIdent, owner: PgIdent) -> Self {
Self {
name,
owner,
options: None,
}
}
/// Serialize a list of database parameters into a Postgres-acceptable
/// string of arguments.
/// NB: `TEMPLATE` is actually also an identifier, but so far we only need
@@ -219,11 +220,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
&[],
)?
.iter()
.map(|row| Database {
name: row.get("datname"),
owner: row.get("owner"),
options: None,
})
.map(|row| Database::new(row.get("datname"), row.get("owner")))
.collect();
Ok(postgres_dbs)

View File

@@ -1,5 +1,6 @@
use std::path::Path;
use std::str::FromStr;
use std::time::Instant;
use anyhow::Result;
use log::{info, log_enabled, warn, Level};
@@ -197,22 +198,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Reassign all dependent objects and delete requested roles.
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
// First, reassign all dependent objects to db owners.
if let Some(ops) = &spec.delta_operations {
if let Some(ops) = &node.spec.delta_operations {
// First, reassign all dependent objects to db owners.
info!("reassigning dependent objects of to-be-deleted roles");
for op in ops {
if op.action == "delete_role" {
reassign_owned_objects(node, &op.name)?;
}
}
}
// Second, proceed with role deletions.
let mut xact = client.transaction()?;
if let Some(ops) = &spec.delta_operations {
// Second, proceed with role deletions.
info!("processing role deletions");
let mut xact = client.transaction()?;
for op in ops {
// We do not check either role exists or not,
// Postgres will take care of it for us
@@ -223,6 +220,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
xact.execute(query.as_str(), &[])?;
}
}
xact.commit()?;
}
Ok(())
@@ -317,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// XXX: with a limited number of databases it is fine, but consider making it a HashMap
let pg_db = existing_dbs.iter().find(|r| r.name == *name);
let start_time = Instant::now();
if let Some(r) = pg_db {
// XXX: db owner name is returned as quoted string from Postgres,
// when quoting is needed.
@@ -335,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
info_print!(" -> update");
client.execute(query.as_str(), &[])?;
let elapsed = start_time.elapsed().as_millis();
info_print!(" ({} ms)", elapsed);
}
} else {
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
@@ -342,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
query.push_str(&db.to_pg_options());
client.execute(query.as_str(), &[])?;
let elapsed = start_time.elapsed().as_millis();
info_print!(" ({} ms)", elapsed);
}
info_print!("\n");

View File

@@ -38,4 +38,33 @@ mod pg_helpers_tests {
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
#[test]
fn generic_options_search() {
let generic_options: GenericOptions = Some(vec![
GenericOption {
name: "present_value".into(),
value: Some("value".into()),
vartype: "string".into(),
},
GenericOption {
name: "missed_value".into(),
value: None,
vartype: "int".into(),
},
]);
assert_eq!(generic_options.find("present_value"), Some("value".into()));
assert_eq!(generic_options.find("missed_value"), None);
assert_eq!(generic_options.find("invalid_value"), None);
let empty_generic_options: GenericOptions = Some(vec![]);
assert_eq!(empty_generic_options.find("present_value"), None);
assert_eq!(empty_generic_options.find("missed_value"), None);
assert_eq!(empty_generic_options.find("invalid_value"), None);
let none_generic_options: GenericOptions = None;
assert_eq!(none_generic_options.find("present_value"), None);
assert_eq!(none_generic_options.find("missed_value"), None);
assert_eq!(none_generic_options.find("invalid_value"), None);
}
}

View File

@@ -2,6 +2,7 @@
name = "control_plane"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = "1.0"
@@ -10,7 +11,7 @@ comfy-table = "6.1"
git-version = "0.3.5"
nix = "0.25"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
regex = "1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }

View File

@@ -136,22 +136,6 @@ where
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}
/// Send SIGTERM to child process
pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
let pid = child.id();
match kill(
nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
Signal::SIGTERM,
) {
Ok(()) => Ok(()),
Err(Errno::ESRCH) => {
println!("child process with pid {pid} does not exist");
Ok(())
}
Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
}
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
let pid = match pid_file::read(pid_file)

View File

@@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
} else if let Some(default_id) = env.default_tenant_id {
Ok(default_id)
} else {
bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant");
}
}
@@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
}
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
// Create config file
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
@@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
env.init(pg_version)
.context("Failed to initialize neon repository")?;
let initial_tenant_id = env
.default_tenant_id
.expect("default_tenant_id should be generated by the `env.init()` call above");
// Initialize pageserver, create initial tenant and timeline.
let pageserver = PageServerNode::from_env(&env);
let initial_timeline_id = pageserver
.initialize(
Some(initial_tenant_id),
initial_timeline_id_arg,
&pageserver_config_overrides(init_match),
pg_version,
)
pageserver
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_owned(),
initial_tenant_id,
initial_timeline_id,
)?;
Ok(env)
}
@@ -388,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
println!(
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
);
if create_match.get_flag("set-default") {
println!("Setting tenant {new_tenant_id} as a default one");
env.default_tenant_id = Some(new_tenant_id);
}
}
Some(("set-default", set_default_match)) => {
let tenant_id =
parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
println!("Setting tenant {tenant_id} as a default one");
env.default_tenant_id = Some(tenant_id);
}
Some(("config", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
@@ -549,7 +544,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
table.load_preset(comfy_table::presets::NOTHING);
table.set_header(&[
table.set_header([
"NODE",
"ADDRESS",
"TIMELINE",
@@ -584,7 +579,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
.map(|name| name.as_str())
.unwrap_or("?");
table.add_row(&[
table.add_row([
node_name.as_str(),
&node.address.to_string(),
&node.timeline_id.to_string(),
@@ -747,7 +742,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
Ok(SafekeeperNode::from_env(env, node))
} else {
bail!("could not find safekeeper '{}'", id)
bail!("could not find safekeeper {id}")
}
}
@@ -806,22 +801,22 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
broker::start_broker_process(env)?;
let pageserver = PageServerNode::from_env(env);
// Postgres nodes are not started automatically
broker::start_broker_process(env)?;
let pageserver = PageServerNode::from_env(env);
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver start failed: {e}");
try_stop_storage_broker_process(env);
eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
try_stop_all(env, true);
exit(1);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
try_stop_storage_broker_process(env);
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false);
exit(1);
}
}
@@ -832,35 +827,41 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
let immediate =
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
try_stop_all(env, immediate);
Ok(())
}
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
let pageserver = PageServerNode::from_env(env);
// Stop all compute nodes
let cplane = ComputeControlPlane::load(env.clone())?;
for (_k, node) in cplane.nodes {
if let Err(e) = node.stop(false) {
eprintln!("postgres stop failed: {}", e);
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
for (_k, node) in cplane.nodes {
if let Err(e) = node.stop(false) {
eprintln!("postgres stop failed: {e:#}");
}
}
}
Err(e) => {
eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}")
}
}
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e);
eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e);
}
}
try_stop_storage_broker_process(env);
Ok(())
}
fn try_stop_storage_broker_process(env: &local_env::LocalEnv) {
if let Err(e) = broker::stop_broker_process(env) {
eprintln!("neon broker stop failed: {e}");
eprintln!("neon broker stop failed: {e:#}");
}
}
@@ -900,6 +901,7 @@ fn cli() -> Command {
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
.value_parser(["fast", "immediate"])
.default_value("fast")
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
@@ -921,9 +923,8 @@ fn cli() -> Command {
.version(GIT_VERSION)
.subcommand(
Command::new("init")
.about("Initialize a new Neon repository")
.about("Initialize a new Neon repository, preparing configs for services to start with")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
@@ -985,11 +986,14 @@ fn cli() -> Command {
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
.arg(pg_version_arg.clone())
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
)
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
)
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
)
.subcommand(
Command::new("pageserver")

View File

@@ -17,7 +17,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
"storage_broker",
&env.base_data_dir,
&env.storage_broker_bin(),
&args,
args,
[],
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|| {

View File

@@ -44,7 +44,7 @@ impl ComputeControlPlane {
let mut nodes = BTreeMap::default();
let pgdatadirspath = &env.pg_data_dirs_path();
for tenant_dir in fs::read_dir(&pgdatadirspath)
for tenant_dir in fs::read_dir(pgdatadirspath)
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
{
let tenant_dir = tenant_dir?;
@@ -67,8 +67,8 @@ impl ComputeControlPlane {
fn get_port(&mut self) -> u16 {
1 + self
.nodes
.iter()
.map(|(_name, node)| node.address.port())
.values()
.map(|node| node.address.port())
.max()
.unwrap_or(self.base_port)
}
@@ -183,7 +183,7 @@ impl PostgresNode {
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
let mut cmd = Command::new(&pg_path);
let mut cmd = Command::new(pg_path);
cmd.arg("--sync-safekeepers")
.env_clear()
@@ -201,7 +201,7 @@ impl PostgresNode {
.stderr(Stdio::piped());
if let Some(token) = auth_token {
cmd.env("ZENITH_AUTH_TOKEN", token);
cmd.env("NEON_AUTH_TOKEN", token);
}
let sync_handle = cmd
@@ -261,7 +261,7 @@ impl PostgresNode {
}
fn create_pgdata(&self) -> Result<()> {
fs::create_dir_all(&self.pgdata()).with_context(|| {
fs::create_dir_all(self.pgdata()).with_context(|| {
format!(
"could not create data directory {}",
self.pgdata().display()
@@ -304,17 +304,17 @@ impl PostgresNode {
// Set up authentication
//
// $ZENITH_AUTH_TOKEN will be replaced with value from environment
// $NEON_AUTH_TOKEN will be replaced with value from environment
// variable during compute pg startup. It is done this way because
// otherwise user will be able to retrieve the value using SHOW
// command or pg_settings
let password = if let AuthType::NeonJWT = auth_type {
"$ZENITH_AUTH_TOKEN"
"$NEON_AUTH_TOKEN"
} else {
""
};
// NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
// Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN
// Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
// We parse this string and build it back with token from env var, and for simplicity rebuild
// uses only needed variables namely host, port, user, password.
format!("postgresql://no_user:{password}@{host}:{port}")
@@ -323,7 +323,7 @@ impl PostgresNode {
conf.append_line("");
conf.append("neon.pageserver_connstring", &pageserver_connstr);
if let AuthType::NeonJWT = auth_type {
conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN");
conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
}
conf.append("neon.tenant_id", &self.tenant_id.to_string());
conf.append("neon.timeline_id", &self.timeline_id.to_string());
@@ -448,7 +448,7 @@ impl PostgresNode {
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
);
if let Some(token) = auth_token {
cmd.env("ZENITH_AUTH_TOKEN", token);
cmd.env("NEON_AUTH_TOKEN", token);
}
let pg_ctl = cmd.output().context("pg_ctl failed")?;
@@ -478,7 +478,7 @@ impl PostgresNode {
postgresql_conf_path.to_str().unwrap()
)
})?;
fs::remove_dir_all(&self.pgdata())?;
fs::remove_dir_all(self.pgdata())?;
self.create_pgdata()?;
// 2. Bring back config files
@@ -514,7 +514,7 @@ impl PostgresNode {
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(&self.pgdata())?;
fs::remove_dir_all(self.pgdata())?;
} else {
self.pg_ctl(&["stop"], &None)?;
}

View File

@@ -296,11 +296,6 @@ impl LocalEnv {
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
// If no initial tenant ID was given, generate it.
if env.default_tenant_id.is_none() {
env.default_tenant_id = Some(TenantId::generate());
}
env.base_data_dir = base_path();
Ok(env)
@@ -404,7 +399,7 @@ impl LocalEnv {
}
}
fs::create_dir(&base_path)?;
fs::create_dir(base_path)?;
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
@@ -413,7 +408,7 @@ impl LocalEnv {
private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(&["-out", private_key_path.to_str().unwrap()])
.args(["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
@@ -430,10 +425,10 @@ impl LocalEnv {
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(&["-in", private_key_path.to_str().unwrap()])
.args(["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.args(["-outform", "PEM"])
.args(["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.context("failed to generate auth private key")?;

View File

@@ -7,7 +7,7 @@ use std::path::PathBuf;
use std::process::{Child, Command};
use std::{io, result};
use anyhow::{bail, ensure, Context};
use anyhow::{bail, Context};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
@@ -130,83 +130,15 @@ impl PageServerNode {
overrides
}
/// Initializes a pageserver node by creating its config with the overrides provided,
/// and creating an initial tenant and timeline afterwards.
pub fn initialize(
&self,
create_tenant: Option<TenantId>,
initial_timeline_id: Option<TimelineId>,
config_overrides: &[&str],
pg_version: u32,
) -> anyhow::Result<TimelineId> {
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides).with_context(|| {
format!(
"Failed to run init for pageserver node {}",
self.env.pageserver.id,
)
})?;
// Then, briefly start it fully to run HTTP commands on it,
// to create initial tenant and timeline.
// We disable the remote storage, since we stop pageserver right after the timeline creation,
// hence most of the uploads will either aborted or not started: no point to start them at all.
let disabled_remote_storage_override = "remote_storage={}";
let mut pageserver_process = self
.start_node(
&[disabled_remote_storage_override],
// Previous overrides will be taken from the config created before, don't overwrite them.
false,
)
.with_context(|| {
format!(
"Failed to start a process for pageserver node {}",
self.env.pageserver.id,
)
})?;
let init_result = self
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
.context("Failed to create initial tenant and timeline for pageserver");
match &init_result {
Ok(initial_timeline_id) => {
println!("Successfully initialized timeline {initial_timeline_id}")
}
Err(e) => eprintln!("{e:#}"),
}
background_process::send_stop_child_process(&pageserver_process)?;
let exit_code = pageserver_process.wait()?;
ensure!(
exit_code.success(),
format!(
"pageserver init failed with exit code {:?}",
exit_code.code()
)
);
println!(
"Stopped pageserver {} process with pid {}",
self.env.pageserver.id,
pageserver_process.id(),
);
init_result
}
fn try_init_timeline(
&self,
new_tenant_id: Option<TenantId>,
new_timeline_id: Option<TimelineId>,
pg_version: u32,
) -> anyhow::Result<TimelineId> {
let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
let initial_timeline_info = self.timeline_create(
initial_tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
)?;
Ok(initial_timeline_info.timeline_id)
})
}
pub fn repo_path(&self) -> PathBuf {
@@ -241,7 +173,7 @@ impl PageServerNode {
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
args.push(Cow::Borrowed("--init"));
let init_output = Command::new(&self.env.pageserver_bin())
let init_output = Command::new(self.env.pageserver_bin())
.args(args.iter().map(Cow::as_ref))
.envs(self.pageserver_env_variables()?)
.output()
@@ -320,7 +252,7 @@ impl PageServerNode {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
vec![("NEON_AUTH_TOKEN".to_owned(), token)]
} else {
Vec::new()
})

90
deny.toml Normal file
View File

@@ -0,0 +1,90 @@
# This file was auto-generated using `cargo deny init`.
# cargo-deny is a cargo plugin that lets you lint your project's
# dependency graph to ensure all your dependencies conform
# to your expectations and requirements.
# Root options
targets = []
all-features = false
no-default-features = false
feature-depth = 1
# This section is considered when running `cargo deny check advisories`
# More documentation for the advisories section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
[advisories]
db-urls = ["https://github.com/rustsec/advisory-db"]
vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
unlicensed = "deny"
allow = [
"Apache-2.0",
"Artistic-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
"ISC",
"MIT",
"MPL-2.0",
"OpenSSL",
"Unicode-DFS-2016",
]
deny = []
copyleft = "warn"
allow-osi-fsf-free = "neither"
default = "deny"
confidence-threshold = 0.8
exceptions = [
# Zlib license has some restrictions if we decide to change sth
{ allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
{ allow = ["Zlib"], name = "const_format", version = "*" },
]
[[licenses.clarify]]
name = "ring"
version = "*"
expression = "MIT AND ISC AND OpenSSL"
license-files = [
{ path = "LICENSE", hash = 0xbd0eed23 },
]
[licenses.private]
ignore = true
registries = []
# This section is considered when running `cargo deny check bans`.
# More documentation about the 'bans' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
[bans]
multiple-versions = "warn"
wildcards = "allow"
highlight = "all"
workspace-default-features = "allow"
external-default-features = "allow"
allow = []
deny = []
skip = []
skip-tree = []
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
[sources]
unknown-registry = "warn"
unknown-git = "warn"
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
allow-git = []
[sources.allow-org]
github = [
"neondatabase",
]
gitlab = []
bitbucket = []

View File

@@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL.
#### Outgoing connections
Compute connects to Pageserver for getting pages.
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`.
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
The environment variable inside the connection string is substituted with
the JWT token.
@@ -77,7 +77,7 @@ If the GUC is unset, no token is passed.
Note that both tokens can be (and typically are) the same;
the scope is the tenant and the token is usually passed through the
`$ZENITH_AUTH_TOKEN` environment variable.
`$NEON_AUTH_TOKEN` environment variable.
### Pageserver
#### Overview
@@ -114,7 +114,7 @@ either of three values:
Pageserver makes a connection to a Safekeeper for each active timeline.
As Pageserver may want to access any timeline it has on the disk,
it is given a blanket JWT token to access any data on any Safekeeper.
This token is passed through an environment variable called `ZENITH_AUTH_TOKEN`
This token is passed through an environment variable called `NEON_AUTH_TOKEN`
(non-configurable as of writing this text).
A better way _may be_ to store JWT token for each timeline next to it,

View File

@@ -2,6 +2,7 @@
name = "metrics"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency

View File

@@ -2,6 +2,7 @@
name = "pageserver_api"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
serde = { version = "1.0", features = ["derive"] }

View File

@@ -163,6 +163,8 @@ pub struct TenantInfo {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub state: TenantState,
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub has_in_progress_downloads: Option<bool>,
}
@@ -191,9 +193,12 @@ pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<u64>,
pub current_physical_size_non_incremental: Option<u64>,
pub timeline_dir_layer_file_size_sum: Option<u64>,
pub wal_source_connstr: Option<String>,
#[serde_as(as = "Option<DisplayFromStr>")]
@@ -203,29 +208,22 @@ pub struct TimelineInfo {
pub pg_version: u32,
pub state: TimelineState,
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
// compatility with older clients.
pub local: LocalTimelineInfo,
pub remote: RemoteTimelineInfo,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct LocalTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
pub struct DownloadRemoteLayersTaskInfo {
pub task_id: String,
pub state: DownloadRemoteLayersTaskState,
pub total_layer_count: u64, // stable once `completed`
pub successful_download_count: u64, // stable once `completed`
pub failed_download_count: u64, // stable once `completed`
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RemoteTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub remote_consistent_lsn: Option<Lsn>,
pub enum DownloadRemoteLayersTaskState {
Running,
Completed,
ShutDown,
}
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
@@ -325,7 +323,7 @@ impl PagestreamFeMessage {
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
@@ -335,7 +333,7 @@ impl PagestreamFeMessage {
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
@@ -345,7 +343,7 @@ impl PagestreamFeMessage {
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
@@ -356,7 +354,7 @@ impl PagestreamFeMessage {
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}

View File

@@ -2,14 +2,15 @@
name = "postgres_connection"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0"
itertools = "0.10.3"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
url = "2.2.2"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,6 +2,7 @@
name = "postgres_ffi"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
rand = "0.8.3"
@@ -21,7 +22,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]
env_logger = "0.9"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
wal_craft = { path = "wal_craft" }
[build-dependencies]

View File

@@ -14,8 +14,8 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
status
);
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let byteno: usize =
((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let bshift: u8 =
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
@@ -25,13 +25,13 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
}
pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let byteno: usize =
((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let bshift: u8 =
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
(page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK
}
// See CLOGPagePrecedes in clog.c

View File

@@ -333,7 +333,7 @@ impl CheckPoint {
// We need this segment to start compute node.
//
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize);
let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
let hdr = XLogLongPageHeaderData {
@@ -574,7 +574,7 @@ mod tests {
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
cfg.wal_dir().join(&last_segment),
cfg.wal_dir().join(last_segment),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();

View File

@@ -2,7 +2,7 @@
name = "wal_craft"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
@@ -11,7 +11,7 @@ clap = "4.0"
env_logger = "0.9"
log = "0.4"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres_ffi = { path = "../" }
tempfile = "3.2"
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }

View File

@@ -81,7 +81,7 @@ impl Conf {
.new_pg_command("initdb")?
.arg("-D")
.arg(self.datadir.as_os_str())
.args(&["-U", "postgres", "--no-instructions", "--no-sync"])
.args(["-U", "postgres", "--no-instructions", "--no-sync"])
.output()?;
debug!("initdb output: {:?}", output);
ensure!(
@@ -105,12 +105,12 @@ impl Conf {
let unix_socket_dir_path = unix_socket_dir.path().to_owned();
let server_process = self
.new_pg_command("postgres")?
.args(&["-c", "listen_addresses="])
.args(["-c", "listen_addresses="])
.arg("-k")
.arg(unix_socket_dir_path.as_os_str())
.arg("-D")
.arg(self.datadir.as_os_str())
.args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
.args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
.args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
.stderr(Stdio::from(log_file))
.spawn()?;
@@ -142,7 +142,7 @@ impl Conf {
);
let output = self
.new_pg_command("pg_waldump")?
.args(&[
.args([
&first_segment_file.as_os_str(),
&last_segment_file.as_os_str(),
])

View File

@@ -2,15 +2,17 @@
name = "pq_proto"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = "1.0"
bytes = "1.0.1"
pin-project-lite = "0.2.7"
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
rand = "0.8.3"
serde = { version = "1.0", features = ["derive"] }
tokio = { version = "1.17", features = ["macros"] }
tracing = "0.1"
thiserror = "1.0"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -5,7 +5,7 @@
// Tools for calling certain async methods in sync contexts.
pub mod sync;
use anyhow::{bail, ensure, Context, Result};
use anyhow::{ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_protocol::PG_EPOCH;
use serde::{Deserialize, Serialize};
@@ -194,6 +194,35 @@ macro_rules! retry_read {
};
}
/// An error occured during connection being open.
#[derive(thiserror::Error, Debug)]
pub enum ConnectionError {
/// IO error during writing to or reading from the connection socket.
#[error("Socket IO error: {0}")]
Socket(std::io::Error),
/// Invalid packet was received from client
#[error("Protocol error: {0}")]
Protocol(String),
/// Failed to parse a protocol mesage
#[error("Message parse error: {0}")]
MessageParse(anyhow::Error),
}
impl From<anyhow::Error> for ConnectionError {
fn from(e: anyhow::Error) -> Self {
Self::MessageParse(e)
}
}
impl ConnectionError {
pub fn into_io_error(self) -> io::Error {
match self {
ConnectionError::Socket(io) => io,
other => io::Error::new(io::ErrorKind::Other, other.to_string()),
}
}
}
impl FeMessage {
/// Read one message from the stream.
/// This function returns `Ok(None)` in case of EOF.
@@ -216,7 +245,9 @@ impl FeMessage {
/// }
/// ```
#[inline(never)]
pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
pub fn read(
stream: &mut (impl io::Read + Unpin),
) -> Result<Option<FeMessage>, ConnectionError> {
Self::read_fut(&mut AsyncishRead(stream)).wait()
}
@@ -224,7 +255,7 @@ impl FeMessage {
/// See documentation for `Self::read`.
pub fn read_fut<Reader>(
stream: &mut Reader,
) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
where
Reader: tokio::io::AsyncRead + Unpin,
{
@@ -238,17 +269,21 @@ impl FeMessage {
let tag = match retry_read!(stream.read_u8().await) {
Ok(b) => b,
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
Err(e) => return Err(e.into()),
Err(e) => return Err(ConnectionError::Socket(e)),
};
// The message length includes itself, so it better be at least 4.
let len = retry_read!(stream.read_u32().await)?
let len = retry_read!(stream.read_u32().await)
.map_err(ConnectionError::Socket)?
.checked_sub(4)
.context("invalid message length")?;
.ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
let body = {
let mut buffer = vec![0u8; len as usize];
stream.read_exact(&mut buffer).await?;
stream
.read_exact(&mut buffer)
.await
.map_err(ConnectionError::Socket)?;
Bytes::from(buffer)
};
@@ -265,7 +300,11 @@ impl FeMessage {
b'c' => Ok(Some(FeMessage::CopyDone)),
b'f' => Ok(Some(FeMessage::CopyFail)),
b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
tag => bail!("unknown message tag: {},'{:?}'", tag, body),
tag => {
return Err(ConnectionError::Protocol(format!(
"unknown message tag: {tag},'{body:?}'"
)))
}
}
})
}
@@ -275,7 +314,9 @@ impl FeStartupPacket {
/// Read startup message from the stream.
// XXX: It's tempting yet undesirable to accept `stream` by value,
// since such a change will cause user-supplied &mut references to be consumed
pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
pub fn read(
stream: &mut (impl io::Read + Unpin),
) -> Result<Option<FeMessage>, ConnectionError> {
Self::read_fut(&mut AsyncishRead(stream)).wait()
}
@@ -284,7 +325,7 @@ impl FeStartupPacket {
// since such a change will cause user-supplied &mut references to be consumed
pub fn read_fut<Reader>(
stream: &mut Reader,
) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
where
Reader: tokio::io::AsyncRead + Unpin,
{
@@ -302,31 +343,41 @@ impl FeStartupPacket {
let len = match retry_read!(stream.read_u32().await) {
Ok(len) => len as usize,
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
Err(e) => return Err(e.into()),
Err(e) => return Err(ConnectionError::Socket(e)),
};
#[allow(clippy::manual_range_contains)]
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
bail!("invalid message length");
return Err(ConnectionError::Protocol(format!(
"invalid message length {len}"
)));
}
let request_code = retry_read!(stream.read_u32().await)?;
let request_code =
retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
// the rest of startup packet are params
let params_len = len - 8;
let mut params_bytes = vec![0u8; params_len];
stream.read_exact(params_bytes.as_mut()).await?;
stream
.read_exact(params_bytes.as_mut())
.await
.map_err(ConnectionError::Socket)?;
// Parse params depending on request code
let req_hi = request_code >> 16;
let req_lo = request_code & ((1 << 16) - 1);
let message = match (req_hi, req_lo) {
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
if params_len != 8 {
return Err(ConnectionError::Protocol(
"expected 8 bytes for CancelRequest params".to_string(),
));
}
let mut cursor = Cursor::new(params_bytes);
FeStartupPacket::CancelRequest(CancelKeyData {
backend_pid: cursor.read_i32().await?,
cancel_key: cursor.read_i32().await?,
backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
})
}
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
@@ -338,7 +389,9 @@ impl FeStartupPacket {
FeStartupPacket::GssEncRequest
}
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
bail!("Unrecognized request code {}", unrecognized_code)
return Err(ConnectionError::Protocol(format!(
"Unrecognized request code {unrecognized_code}"
)));
}
// TODO bail if protocol major_version is not 3?
(major_version, minor_version) => {
@@ -346,15 +399,21 @@ impl FeStartupPacket {
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&params_bytes)
.context("StartupMessage params: invalid utf-8")?
.strip_suffix('\0') // drop packet's own null terminator
.context("StartupMessage params: missing null terminator")?
.strip_suffix('\0') // drop packet's own null
.ok_or_else(|| {
ConnectionError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?
.split_terminator('\0');
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens
.next()
.context("StartupMessage params: key without value")?;
let value = tokens.next().ok_or_else(|| {
ConnectionError::Protocol(
"StartupMessage params: key without value".to_string(),
)
})?;
params.insert(name.to_owned(), value.to_owned());
}
@@ -458,12 +517,15 @@ pub enum BeMessage<'a> {
CloseComplete,
// None means column is NULL
DataRow(&'a [Option<&'a [u8]>]),
ErrorResponse(&'a str),
ErrorResponse(&'a str, Option<&'a [u8; 5]>),
/// Single byte - used in response to SSLRequest/GSSENCRequest.
EncryptionResponse(bool),
NoData,
ParameterDescription,
ParameterStatus(BeParameterStatusMessage<'a>),
ParameterStatus {
name: &'a [u8],
value: &'a [u8],
},
ParseComplete,
ReadyForQuery,
RowDescription(&'a [RowDescriptor<'a>]),
@@ -472,6 +534,28 @@ pub enum BeMessage<'a> {
KeepAlive(WalSndKeepAlive),
}
/// Common shorthands.
impl<'a> BeMessage<'a> {
/// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8.
/// This is a sensible default, given that:
/// * rust strings only support this encoding out of the box.
/// * tokio-postgres, postgres-jdbc (and probably more) mandate it.
///
/// TODO: do we need to report `server_encoding` as well?
pub const CLIENT_ENCODING: Self = Self::ParameterStatus {
name: b"client_encoding",
value: b"UTF8",
};
/// Build a [`BeMessage::ParameterStatus`] holding the server version.
pub fn server_version(version: &'a str) -> Self {
Self::ParameterStatus {
name: b"server_version",
value: version.as_bytes(),
}
}
}
#[derive(Debug)]
pub enum BeAuthenticationSaslMessage<'a> {
Methods(&'a [&'a str]),
@@ -485,12 +569,6 @@ pub enum BeParameterStatusMessage<'a> {
ServerVersion(&'a str),
}
impl BeParameterStatusMessage<'static> {
pub fn encoding() -> BeMessage<'static> {
BeMessage::ParameterStatus(Self::Encoding("UTF8"))
}
}
// One row description in RowDescription packet.
#[derive(Debug)]
pub struct RowDescriptor<'a> {
@@ -587,14 +665,15 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
}
/// Safe write of s into buf as cstring (String in the protocol).
fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
if s.contains(&0) {
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
let bytes = s.as_ref();
if bytes.contains(&0) {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"string contains embedded null",
));
}
buf.put_slice(s);
buf.put_slice(bytes);
buf.put_u8(0);
Ok(())
}
@@ -606,6 +685,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
Ok(result)
}
pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
impl<'a> BeMessage<'a> {
/// Write message to the given buf.
// Unlike the reading side, we use BytesMut
@@ -644,7 +725,7 @@ impl<'a> BeMessage<'a> {
Methods(methods) => {
buf.put_i32(10); // Specifies that SASL auth method is used.
for method in methods.iter() {
write_cstr(method.as_bytes(), buf)?;
write_cstr(method, buf)?;
}
buf.put_u8(0); // zero terminator for the list
}
@@ -745,10 +826,7 @@ impl<'a> BeMessage<'a> {
// First byte of each field represents type of this field. Set just enough fields
// to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
// message text.
BeMessage::ErrorResponse(error_msg) => {
// For all the errors set Severity to Error and error code to
// 'internal error'.
BeMessage::ErrorResponse(error_msg, pg_error_code) => {
// 'E' signalizes ErrorResponse messages
buf.put_u8(b'E');
write_body(buf, |buf| {
@@ -756,10 +834,12 @@ impl<'a> BeMessage<'a> {
buf.put_slice(b"ERROR\0");
buf.put_u8(b'C'); // SQLSTATE error code
buf.put_slice(b"CXX000\0");
buf.put_slice(&terminate_code(
pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR),
));
buf.put_u8(b'M'); // the message
write_cstr(error_msg.as_bytes(), buf)?;
write_cstr(error_msg, buf)?;
buf.put_u8(0); // terminator
Ok::<_, io::Error>(())
@@ -779,7 +859,7 @@ impl<'a> BeMessage<'a> {
buf.put_slice(b"NOTICE\0");
buf.put_u8(b'C'); // SQLSTATE error code
buf.put_slice(b"CXX000\0");
buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR));
buf.put_u8(b'M'); // the message
write_cstr(error_msg.as_bytes(), buf)?;
@@ -799,24 +879,12 @@ impl<'a> BeMessage<'a> {
buf.put_u8(response);
}
BeMessage::ParameterStatus(param) => {
use std::io::{IoSlice, Write};
use BeParameterStatusMessage::*;
let [name, value] = match param {
Encoding(name) => [b"client_encoding", name.as_bytes()],
ServerVersion(version) => [b"server_version", version.as_bytes()],
};
// Parameter names and values are passed as null-terminated strings
let iov = &mut [name, b"\0", value, b"\0"].map(IoSlice::new);
let mut buffer = [0u8; 64]; // this should be enough
let cnt = buffer.as_mut().write_vectored(iov).unwrap();
BeMessage::ParameterStatus { name, value } => {
buf.put_u8(b'S');
write_body(buf, |buf| {
buf.put_slice(&buffer[..cnt]);
});
write_cstr(name, buf)?;
write_cstr(value, buf)
})?;
}
BeMessage::ParameterDescription => {
@@ -873,7 +941,7 @@ impl<'a> BeMessage<'a> {
buf.put_u8(b'k');
buf.put_u64(req.sent_ptr);
buf.put_i64(req.timestamp);
buf.put_u8(if req.request_reply { 1 } else { 0 });
buf.put_u8(u8::from(req.request_reply));
});
}
}
@@ -1079,3 +1147,12 @@ mod tests {
let _ = FeStartupPacket::read_fut(stream).await;
}
}
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
let mut terminated = [0; 6];
for (i, &elem) in code.iter().enumerate() {
terminated[i] = elem;
}
terminated
}

View File

@@ -2,6 +2,7 @@
name = "remote_storage"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }

View File

@@ -7,6 +7,7 @@
//!
mod local_fs;
mod s3_bucket;
mod simulate_failures;
use std::{
collections::HashMap,
@@ -24,7 +25,7 @@ use tokio::io;
use toml_edit::Item;
use tracing::info;
pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket};
pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -77,7 +78,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
/// so this method doesnt need to.
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError>;
/// Streams the local file contents into remote into the remote storage entry.
async fn upload(
@@ -150,6 +154,7 @@ impl std::error::Error for DownloadError {}
pub enum GenericRemoteStorage {
LocalFs(LocalFs),
AwsS3(Arc<S3Bucket>),
Unreliable(Arc<UnreliableWrapper>),
}
impl Deref for GenericRemoteStorage {
@@ -159,27 +164,30 @@ impl Deref for GenericRemoteStorage {
match self {
GenericRemoteStorage::LocalFs(local_fs) => local_fs,
GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
GenericRemoteStorage::Unreliable(s) => s.as_ref(),
}
}
}
impl GenericRemoteStorage {
pub fn from_config(
storage_config: &RemoteStorageConfig,
) -> anyhow::Result<GenericRemoteStorage> {
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs(root) => {
info!("Using fs root '{}' as a remote storage", root.display());
GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?)
Self::LocalFs(LocalFs::new(root.clone())?)
}
RemoteStorageKind::AwsS3(s3_config) => {
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
}
})
}
pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
}
/// Takes storage object contents and its size and uploads to remote storage,
/// mapping `from_path` to the corresponding remote object id in the storage.
///

View File

@@ -92,13 +92,17 @@ impl RemoteStorage for LocalFs {
.collect())
}
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
Ok(get_all_files(path.as_ref(), false)
.await?
.await
.map_err(DownloadError::Other)?
.into_iter()
.map(|path| {
path.strip_prefix(&self.storage_root)

View File

@@ -286,7 +286,10 @@ impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_prefixes`
/// Note: it wont include empty "directories"
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
@@ -308,7 +311,8 @@ impl RemoteStorage for S3Bucket {
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list")?;
.context("Concurrency limiter semaphore got closed during S3 list")
.map_err(DownloadError::Other)?;
metrics::inc_list_objects();
@@ -324,7 +328,9 @@ impl RemoteStorage for S3Bucket {
.map_err(|e| {
metrics::inc_list_objects_fail();
e
})?;
})
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other)?;
document_keys.extend(
fetch_response

View File

@@ -0,0 +1,129 @@
//! This module provides a wrapper around a real RemoteStorage implementation that
//! causes the first N attempts at each upload or download operatio to fail. For
//! testing purposes.
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Mutex;
use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
pub struct UnreliableWrapper {
inner: crate::GenericRemoteStorage,
// This many attempts of each operation will fail, then we let it succeed.
attempts_to_fail: u64,
// Tracks how many failed attempts of each operation has been made.
attempts: Mutex<HashMap<RemoteOp, u64>>,
}
/// Used to identify retries of different unique operation.
#[derive(Debug, Hash, Eq, PartialEq)]
enum RemoteOp {
List,
ListPrefixes(Option<RemotePath>),
Upload(RemotePath),
Download(RemotePath),
Delete(RemotePath),
}
impl UnreliableWrapper {
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
assert!(attempts_to_fail > 0);
UnreliableWrapper {
inner,
attempts_to_fail,
attempts: Mutex::new(HashMap::new()),
}
}
///
/// Common functionality for all operations.
///
/// On the first attempts of this operation, return an error. After 'attempts_to_fail'
/// attempts, let the operation go ahead, and clear the counter.
///
fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
let mut attempts = self.attempts.lock().unwrap();
match attempts.entry(op) {
Entry::Occupied(mut e) => {
let attempts_before_this = {
let p = e.get_mut();
*p += 1;
*p
};
if attempts_before_this >= self.attempts_to_fail {
// let it succeed
e.remove();
Ok(attempts_before_this)
} else {
let error =
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
Err(DownloadError::Other(error))
}
}
Entry::Vacant(e) => {
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
e.insert(1);
Err(DownloadError::Other(error))
}
}
}
}
#[async_trait::async_trait]
impl RemoteStorage for UnreliableWrapper {
/// Lists all items the storage has right now.
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
self.attempt(RemoteOp::List)?;
self.inner.list().await
}
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
self.inner.list_prefixes(prefix).await
}
async fn upload(
&self,
data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
self.attempt(RemoteOp::Upload(to.clone()))?;
self.inner.upload(data, data_size_bytes, to, metadata).await
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
self.attempt(RemoteOp::Download(from.clone()))?;
self.inner.download(from).await
}
async fn download_byte_range(
&self,
from: &RemotePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError> {
// Note: We treat any download_byte_range as an "attempt" of the same
// operation. We don't pay attention to the ranges. That's good enough
// for now.
self.attempt(RemoteOp::Download(from.clone()))?;
self.inner
.download_byte_range(from, start_inclusive, end_exclusive)
.await
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
self.attempt(RemoteOp::Delete(path.clone()))?;
self.inner.delete(path).await
}
}

View File

@@ -2,6 +2,7 @@
name = "safekeeper_api"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
serde = { version = "1.0", features = ["derive"] }

View File

@@ -3,6 +3,7 @@ name = "tenant_size_model"
version = "0.1.0"
edition = "2021"
publish = false
license = "Apache-2.0"
[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,9 +2,10 @@
name = "utils"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
sentry = "0.29.0"
sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
async-trait = "0.1"
anyhow = "1.0"
bincode = "1.3"

View File

@@ -157,34 +157,34 @@ mod tests {
assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);
let invalid_dir_path = file_path.join("folder");
create_dir_all(&invalid_dir_path).unwrap_err();
create_dir_all(invalid_dir_path).unwrap_err();
}
#[test]
fn test_path_with_suffix_extension() {
let p = PathBuf::from("/foo/bar");
assert_eq!(
&path_with_suffix_extension(&p, "temp").to_string_lossy(),
&path_with_suffix_extension(p, "temp").to_string_lossy(),
"/foo/bar.temp"
);
let p = PathBuf::from("/foo/bar");
assert_eq!(
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
&path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
"/foo/bar.temp.temp"
);
let p = PathBuf::from("/foo/bar.baz");
assert_eq!(
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
&path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
"/foo/bar.baz.temp.temp"
);
let p = PathBuf::from("/foo/bar.baz");
assert_eq!(
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
&path_with_suffix_extension(p, ".temp").to_string_lossy(),
"/foo/bar.baz..temp"
);
let p = PathBuf::from("/foo/bar/dir/");
assert_eq!(
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
&path_with_suffix_extension(p, ".temp").to_string_lossy(),
"/foo/bar/dir..temp"
);
}

View File

@@ -3,11 +3,11 @@
//! implementation determining how to process the queries. Currently its API
//! is rather narrow, but we can extend it once required.
use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
use anyhow::{bail, ensure, Context, Result};
use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
use serde::{Deserialize, Serialize};
use std::fmt;
use std::io::{self, Write};
@@ -22,25 +22,32 @@ pub trait Handler {
/// postgres_backend will issue ReadyForQuery after calling this (this
/// might be not what we want after CopyData streaming, but currently we don't
/// care).
fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> Result<(), QueryError>;
/// Called on startup packet receival, allows to process params.
///
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
/// to override whole init logic in implementations.
fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
fn startup(
&mut self,
_pgb: &mut PostgresBackend,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
Ok(())
}
/// Check auth md5
fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
bail!("MD5 auth failed")
}
/// Check auth jwt
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
bail!("JWT auth failed")
fn check_auth_jwt(
&mut self,
_pgb: &mut PostgresBackend,
_jwt_response: &[u8],
) -> Result<(), QueryError> {
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
}
fn is_shutdown_requested(&self) -> bool {
@@ -61,7 +68,6 @@ pub enum ProtoState {
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
pub enum AuthType {
Trust,
MD5,
// This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
NeonJWT,
}
@@ -72,9 +78,8 @@ impl FromStr for AuthType {
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"Trust" => Ok(Self::Trust),
"MD5" => Ok(Self::MD5),
"NeonJWT" => Ok(Self::NeonJWT),
_ => bail!("invalid value \"{s}\" for auth type"),
_ => anyhow::bail!("invalid value \"{s}\" for auth type"),
}
}
}
@@ -83,7 +88,6 @@ impl fmt::Display for AuthType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(match self {
AuthType::Trust => "Trust",
AuthType::MD5 => "MD5",
AuthType::NeonJWT => "NeonJWT",
})
}
@@ -134,7 +138,6 @@ pub struct PostgresBackend {
pub state: ProtoState,
md5_salt: [u8; 4],
auth_type: AuthType,
peer_addr: SocketAddr,
@@ -164,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
}
// Cast a byte slice to a string slice, dropping null terminator if there's one.
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
std::str::from_utf8(without_null).map_err(|e| e.into())
}
@@ -187,7 +190,6 @@ impl PostgresBackend {
stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
buf_out: BytesMut::with_capacity(10 * 1024),
state: ProtoState::Initialization,
md5_salt: [0u8; 4],
auth_type,
tls_config,
peer_addr,
@@ -199,10 +201,10 @@ impl PostgresBackend {
}
/// Get direct reference (into the Option) to the read stream.
fn get_stream_in(&mut self) -> Result<&mut BidiStream> {
fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
match &mut self.stream {
Some(Stream::Bidirectional(stream)) => Ok(stream),
_ => bail!("reader taken"),
_ => anyhow::bail!("reader taken"),
}
}
@@ -226,7 +228,7 @@ impl PostgresBackend {
}
/// Read full message or return None if connection is closed.
pub fn read_message(&mut self) -> Result<Option<FeMessage>> {
pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
let (state, stream) = (self.state, self.get_stream_in()?);
use ProtoState::*;
@@ -234,6 +236,7 @@ impl PostgresBackend {
Initialization | Encrypted => FeStartupPacket::read(stream),
Authentication | Established => FeMessage::read(stream),
}
.map_err(QueryError::from)
}
/// Write message into internal output buffer.
@@ -257,7 +260,7 @@ impl PostgresBackend {
}
// Wrapper for run_message_loop() that shuts down socket when we are done
pub fn run(mut self, handler: &mut impl Handler) -> Result<()> {
pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
let ret = self.run_message_loop(handler);
if let Some(stream) = self.stream.as_mut() {
let _ = stream.shutdown(Shutdown::Both);
@@ -265,7 +268,7 @@ impl PostgresBackend {
ret
}
fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> {
fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
trace!("postgres backend to {:?} started", self.peer_addr);
let mut unnamed_query_string = Bytes::new();
@@ -274,7 +277,7 @@ impl PostgresBackend {
match self.read_message() {
Ok(message) => {
if let Some(msg) = message {
trace!("got message {:?}", msg);
trace!("got message {msg:?}");
match self.process_message(handler, msg, &mut unnamed_query_string)? {
ProcessMsgResult::Continue => continue,
@@ -285,10 +288,12 @@ impl PostgresBackend {
}
}
Err(e) => {
// If it is a timeout error, continue the loop
if !is_socket_read_timed_out(&e) {
return Err(e);
if let QueryError::Other(e) = &e {
if is_socket_read_timed_out(e) {
continue;
}
}
return Err(e);
}
}
}
@@ -306,7 +311,7 @@ impl PostgresBackend {
}
stream => {
self.stream = stream;
bail!("can't start TLs without bidi stream");
anyhow::bail!("can't start TLs without bidi stream");
}
}
}
@@ -316,17 +321,16 @@ impl PostgresBackend {
handler: &mut impl Handler,
msg: FeMessage,
unnamed_query_string: &mut Bytes,
) -> Result<ProcessMsgResult> {
) -> Result<ProcessMsgResult, QueryError> {
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
if self.state < ProtoState::Established {
ensure!(
matches!(
msg,
FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
),
"protocol violation"
);
if self.state < ProtoState::Established
&& !matches!(
msg,
FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
)
{
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
}
let have_tls = self.tls_config.is_some();
@@ -350,8 +354,13 @@ impl PostgresBackend {
}
FeStartupPacket::StartupMessage { .. } => {
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
bail!("client did not connect with TLS");
self.write_message(&BeMessage::ErrorResponse(
"must connect with TLS",
None,
))?;
return Err(QueryError::Other(anyhow::anyhow!(
"client did not connect with TLS"
)));
}
// NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -361,21 +370,12 @@ impl PostgresBackend {
match self.auth_type {
AuthType::Trust => {
self.write_message_noflush(&BeMessage::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?
.write_message_noflush(&BeMessage::CLIENT_ENCODING)?
// The async python driver requires a valid server_version
.write_message_noflush(&BeMessage::ParameterStatus(
BeParameterStatusMessage::ServerVersion("14.1"),
))?
.write_message_noflush(&BeMessage::server_version("14.1"))?
.write_message(&BeMessage::ReadyForQuery)?;
self.state = ProtoState::Established;
}
AuthType::MD5 => {
rand::thread_rng().fill(&mut self.md5_salt);
self.write_message(&BeMessage::AuthenticationMD5Password(
self.md5_salt,
))?;
self.state = ProtoState::Authentication;
}
AuthType::NeonJWT => {
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
self.state = ProtoState::Authentication;
@@ -395,25 +395,20 @@ impl PostgresBackend {
match self.auth_type {
AuthType::Trust => unreachable!(),
AuthType::MD5 => {
let (_, md5_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_md5(self, md5_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
}
}
AuthType::NeonJWT => {
let (_, jwt_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
return Err(e);
}
}
}
self.write_message_noflush(&BeMessage::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?
.write_message_noflush(&BeMessage::CLIENT_ENCODING)?
.write_message(&BeMessage::ReadyForQuery)?;
self.state = ProtoState::Established;
}
@@ -422,33 +417,14 @@ impl PostgresBackend {
// remove null terminator
let query_string = cstr_to_str(&body)?;
trace!("got query {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got query {query_string:?}");
if let Err(e) = handler.process_query(self, query_string) {
// ":?" uses the alternate formatting style, which makes anyhow display the
// full cause of the error, not just the top-level context + its trace.
// We don't want to send that in the ErrorResponse though,
// because it's not relevant to the compute node logs.
//
// We also don't want to log full stacktrace when the error is primitive,
// such as usual connection closed.
let short_error = format!("{:#}", e);
let root_cause = e.root_cause().to_string();
if root_cause.contains("connection closed unexpectedly")
|| root_cause.contains("Broken pipe (os error 32)")
{
error!(
"query handler for '{}' failed: {}",
query_string, short_error
);
} else {
error!("query handler for '{}' failed: {:?}", query_string, e);
}
self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?;
// TODO: untangle convoluted control flow
if e.to_string().contains("failed to run") {
return Ok(ProcessMsgResult::Break);
}
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
}
self.write_message(&BeMessage::ReadyForQuery)?;
}
@@ -473,11 +449,13 @@ impl PostgresBackend {
FeMessage::Execute(_) => {
let query_string = cstr_to_str(unnamed_query_string)?;
trace!("got execute {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got execute {query_string:?}");
if let Err(e) = handler.process_query(self, query_string) {
error!("query handler for '{}' failed: {:?}", query_string, e);
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
log_query_error(query_string, &e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
}
// NOTE there is no ReadyForQuery message. This handler is used
// for basebackup and it uses CopyOut which doesn't require
@@ -496,7 +474,9 @@ impl PostgresBackend {
// We prefer explicit pattern matching to wildcards, because
// this helps us spot the places where new variants are missing
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
bail!("unexpected message type: {:?}", msg);
return Err(QueryError::Other(anyhow::anyhow!(
"unexpected message type: {msg:?}"
)));
}
}

View File

@@ -4,45 +4,87 @@
//! is rather narrow, but we can extend it once required.
use crate::postgres_backend::AuthType;
use anyhow::{bail, Context, Result};
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use anyhow::Context;
use bytes::{Buf, Bytes, BytesMut};
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
use std::future::Future;
use std::io;
use std::net::SocketAddr;
use std::pin::Pin;
use std::sync::Arc;
use std::task::Poll;
use tracing::{debug, error, trace};
use tracing::{debug, error, info, trace};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
use tokio_rustls::TlsAcceptor;
pub fn is_expected_io_error(e: &io::Error) -> bool {
use io::ErrorKind::*;
matches!(
e.kind(),
ConnectionRefused | ConnectionAborted | ConnectionReset
)
}
/// An error, occurred during query processing:
/// either during the connection ([`ConnectionError`]) or before/after it.
#[derive(thiserror::Error, Debug)]
pub enum QueryError {
/// The connection was lost while processing the query.
#[error(transparent)]
Disconnected(#[from] ConnectionError),
/// Some other error
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl From<io::Error> for QueryError {
fn from(e: io::Error) -> Self {
Self::Disconnected(ConnectionError::Socket(e))
}
}
impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self {
Self::Disconnected(_) => b"08006", // connection failure
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
}
}
}
#[async_trait::async_trait]
pub trait Handler {
/// Handle single query.
/// postgres_backend will issue ReadyForQuery after calling this (this
/// might be not what we want after CopyData streaming, but currently we don't
/// care).
async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
async fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> Result<(), QueryError>;
/// Called on startup packet receival, allows to process params.
///
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
/// to override whole init logic in implementations.
fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
fn startup(
&mut self,
_pgb: &mut PostgresBackend,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
Ok(())
}
/// Check auth md5
fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
bail!("MD5 auth failed")
}
/// Check auth jwt
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
bail!("JWT auth failed")
fn check_auth_jwt(
&mut self,
_pgb: &mut PostgresBackend,
_jwt_response: &[u8],
) -> Result<(), QueryError> {
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
}
}
@@ -76,17 +118,14 @@ impl AsyncWrite for Stream {
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, std::io::Error>> {
) -> Poll<io::Result<usize>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
Self::Broken => unreachable!(),
}
}
fn poll_flush(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
@@ -96,7 +135,7 @@ impl AsyncWrite for Stream {
fn poll_shutdown(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
) -> Poll<io::Result<()>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
@@ -109,7 +148,7 @@ impl AsyncRead for Stream {
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut tokio::io::ReadBuf<'_>,
) -> Poll<Result<(), std::io::Error>> {
) -> Poll<io::Result<()>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
@@ -120,12 +159,14 @@ impl AsyncRead for Stream {
pub struct PostgresBackend {
stream: Stream,
// Output buffer. c.f. BeMessage::write why we are using BytesMut here.
// The data between 0 and "current position" as tracked by the bytes::Buf
// implementation of BytesMut, have already been written.
buf_out: BytesMut,
pub state: ProtoState,
md5_salt: [u8; 4],
auth_type: AuthType,
peer_addr: SocketAddr,
@@ -143,7 +184,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
}
// Cast a byte slice to a string slice, dropping null terminator if there's one.
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
std::str::from_utf8(without_null).map_err(|e| e.into())
}
@@ -153,14 +194,13 @@ impl PostgresBackend {
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> std::io::Result<Self> {
) -> io::Result<Self> {
let peer_addr = socket.peer_addr()?;
Ok(Self {
stream: Stream::Unencrypted(BufReader::new(socket)),
buf_out: BytesMut::with_capacity(10 * 1024),
state: ProtoState::Initialization,
md5_salt: [0u8; 4],
auth_type,
tls_config,
peer_addr,
@@ -172,30 +212,68 @@ impl PostgresBackend {
}
/// Read full message or return None if connection is closed.
pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
use ProtoState::*;
match self.state {
Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
Closed => Ok(None),
}
.map_err(QueryError::from)
}
/// Flush output buffer into the socket.
pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
self.stream.write_all(&self.buf_out).await?;
pub async fn flush(&mut self) -> io::Result<()> {
while self.buf_out.has_remaining() {
let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
self.buf_out.advance(bytes_written);
}
self.buf_out.clear();
Ok(self)
Ok(())
}
/// Write message into internal output buffer.
pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
BeMessage::write(&mut self.buf_out, message)?;
Ok(self)
}
/// Returns an AsyncWrite implementation that wraps all the data written
/// to it in CopyData messages, and writes them to the connection
///
/// The caller is responsible for sending CopyOutResponse and CopyDone messages.
pub fn copyout_writer(&mut self) -> CopyDataWriter {
CopyDataWriter { pgb: self }
}
/// A polling function that tries to write all the data from 'buf_out' to the
/// underlying stream.
fn poll_write_buf(
&mut self,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
while self.buf_out.has_remaining() {
match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
Poll::Ready(Ok(bytes_written)) => {
self.buf_out.advance(bytes_written);
}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
}
Poll::Ready(Ok(()))
}
fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
Pin::new(&mut self.stream).poll_flush(cx)
}
// Wrapper for run_message_loop() that shuts down socket when we are done
pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
pub async fn run<F, S>(
mut self,
handler: &mut impl Handler,
shutdown_watcher: F,
) -> Result<(), QueryError>
where
F: Fn() -> S,
S: Future,
@@ -209,7 +287,7 @@ impl PostgresBackend {
&mut self,
handler: &mut impl Handler,
shutdown_watcher: F,
) -> Result<()>
) -> Result<(), QueryError>
where
F: Fn() -> S,
S: Future,
@@ -245,7 +323,7 @@ impl PostgresBackend {
return Ok(());
}
}
Ok::<(), anyhow::Error>(())
Ok::<(), QueryError>(())
} => {
// Handshake complete.
result?;
@@ -290,14 +368,14 @@ impl PostgresBackend {
self.stream = Stream::Tls(Box::new(tls_stream));
return Ok(());
};
bail!("TLS already started");
anyhow::bail!("TLS already started");
}
async fn process_handshake_message(
&mut self,
handler: &mut impl Handler,
msg: FeMessage,
) -> Result<ProcessMsgResult> {
) -> Result<ProcessMsgResult, QueryError> {
assert!(self.state < ProtoState::Established);
let have_tls = self.tls_config.is_some();
match msg {
@@ -320,8 +398,13 @@ impl PostgresBackend {
}
FeStartupPacket::StartupMessage { .. } => {
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
bail!("client did not connect with TLS");
self.write_message(&BeMessage::ErrorResponse(
"must connect with TLS",
None,
))?;
return Err(QueryError::Other(anyhow::anyhow!(
"client did not connect with TLS"
)));
}
// NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -331,21 +414,12 @@ impl PostgresBackend {
match self.auth_type {
AuthType::Trust => {
self.write_message(&BeMessage::AuthenticationOk)?
.write_message(&BeParameterStatusMessage::encoding())?
.write_message(&BeMessage::CLIENT_ENCODING)?
// The async python driver requires a valid server_version
.write_message(&BeMessage::ParameterStatus(
BeParameterStatusMessage::ServerVersion("14.1"),
))?
.write_message(&BeMessage::server_version("14.1"))?
.write_message(&BeMessage::ReadyForQuery)?;
self.state = ProtoState::Established;
}
AuthType::MD5 => {
rand::thread_rng().fill(&mut self.md5_salt);
self.write_message(&BeMessage::AuthenticationMD5Password(
self.md5_salt,
))?;
self.state = ProtoState::Authentication;
}
AuthType::NeonJWT => {
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
self.state = ProtoState::Authentication;
@@ -366,25 +440,20 @@ impl PostgresBackend {
match self.auth_type {
AuthType::Trust => unreachable!(),
AuthType::MD5 => {
let (_, md5_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_md5(self, md5_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
}
}
AuthType::NeonJWT => {
let (_, jwt_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
return Err(e);
}
}
}
self.write_message(&BeMessage::AuthenticationOk)?
.write_message(&BeParameterStatusMessage::encoding())?
.write_message(&BeMessage::CLIENT_ENCODING)?
.write_message(&BeMessage::ReadyForQuery)?;
self.state = ProtoState::Established;
}
@@ -402,33 +471,28 @@ impl PostgresBackend {
handler: &mut impl Handler,
msg: FeMessage,
unnamed_query_string: &mut Bytes,
) -> Result<ProcessMsgResult> {
) -> Result<ProcessMsgResult, QueryError> {
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
assert!(self.state == ProtoState::Established);
match msg {
FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
bail!("protocol violation");
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
}
FeMessage::Query(body) => {
// remove null terminator
let query_string = cstr_to_str(&body)?;
trace!("got query {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got query {query_string:?}");
if let Err(e) = handler.process_query(self, query_string).await {
// ":?" uses the alternate formatting style, which makes anyhow display the
// full cause of the error, not just the top-level context + its trace.
// We don't want to send that in the ErrorResponse though,
// because it's not relevant to the compute node logs.
error!("query handler for '{}' failed: {:?}", query_string, e);
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
// TODO: untangle convoluted control flow
if e.to_string().contains("failed to run") {
return Ok(ProcessMsgResult::Break);
}
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
}
self.write_message(&BeMessage::ReadyForQuery)?;
}
@@ -453,11 +517,13 @@ impl PostgresBackend {
FeMessage::Execute(_) => {
let query_string = cstr_to_str(unnamed_query_string)?;
trace!("got execute {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got execute {query_string:?}");
if let Err(e) = handler.process_query(self, query_string).await {
error!("query handler for '{}' failed: {:?}", query_string, e);
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
log_query_error(query_string, &e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
}
// NOTE there is no ReadyForQuery message. This handler is used
// for basebackup and it uses CopyOut which doesn't require
@@ -476,10 +542,99 @@ impl PostgresBackend {
// We prefer explicit pattern matching to wildcards, because
// this helps us spot the places where new variants are missing
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
bail!("unexpected message type: {:?}", msg);
return Err(QueryError::Other(anyhow::anyhow!(
"unexpected message type: {:?}",
msg
)));
}
}
Ok(ProcessMsgResult::Continue)
}
}
///
/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
/// messages.
///
pub struct CopyDataWriter<'a> {
pgb: &'a mut PostgresBackend,
}
impl<'a> AsyncWrite for CopyDataWriter<'a> {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, std::io::Error>> {
let this = self.get_mut();
// It's not strictly required to flush between each message, but makes it easier
// to view in wireshark, and usually the messages that the callers write are
// decently-sized anyway.
match this.pgb.poll_write_buf(cx) {
Poll::Ready(Ok(())) => {}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
// CopyData
// XXX: if the input is large, we should split it into multiple messages.
// Not sure what the threshold should be, but the ultimate hard limit is that
// the length cannot exceed u32.
this.pgb.write_message(&BeMessage::CopyData(buf))?;
Poll::Ready(Ok(buf.len()))
}
fn poll_flush(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
let this = self.get_mut();
match this.pgb.poll_write_buf(cx) {
Poll::Ready(Ok(())) => {}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
this.pgb.poll_flush(cx)
}
fn poll_shutdown(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
let this = self.get_mut();
match this.pgb.poll_write_buf(cx) {
Poll::Ready(Ok(())) => {}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
this.pgb.poll_flush(cx)
}
}
pub fn short_error(e: &QueryError) -> String {
match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Other(e) => format!("{e:#}"),
}
}
pub(super) fn log_query_error(query: &str, e: &QueryError) {
match e {
QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
if is_expected_io_error(io_error) {
info!("query handler for '{query}' failed with expected io error: {io_error}");
} else {
error!("query handler for '{query}' failed with io error: {io_error}");
}
}
QueryError::Disconnected(other_connection_error) => {
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
}
QueryError::Other(e) => {
error!("query handler for '{query}' failed: {e:?}");
}
}
}

View File

@@ -11,11 +11,13 @@ use tokio::time::timeout;
/// An error happened while waiting for a number
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[error("SeqWaitError")]
pub enum SeqWaitError {
/// The wait timeout was reached
#[error("seqwait timeout was reached")]
Timeout,
/// [`SeqWait::shutdown`] was called
#[error("SeqWait::shutdown was called")]
Shutdown,
}

View File

@@ -50,7 +50,7 @@ impl BufStream {
/// Returns a reference to the underlying TcpStream.
fn get_ref(&self) -> &TcpStream {
&*self.0.get_ref().0
&self.0.get_ref().0
}
}

View File

@@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use once_cell::sync::Lazy;
use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
use utils::{
postgres_backend::{AuthType, Handler, PostgresBackend},
postgres_backend_async::QueryError,
};
fn make_tcp_pair() -> (TcpStream, TcpStream) {
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
@@ -105,7 +108,7 @@ fn ssl() {
&mut self,
_pgb: &mut PostgresBackend,
query_string: &str,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
self.got_query = query_string == QUERY;
Ok(())
}
@@ -152,7 +155,7 @@ fn no_ssl() {
&mut self,
_pgb: &mut PostgresBackend,
_query_string: &str,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
panic!()
}
}
@@ -212,7 +215,7 @@ fn server_forces_ssl() {
&mut self,
_pgb: &mut PostgresBackend,
_query_string: &str,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
panic!()
}
}

View File

@@ -2,6 +2,7 @@
name = "pageserver"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[features]
default = []
@@ -9,8 +10,6 @@ default = []
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
profiling = ["pprof"]
[dependencies]
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
anyhow = { version = "1.0", features = ["backtrace"] }
@@ -18,7 +17,7 @@ async-stream = "0.3"
async-trait = "0.1"
byteorder = "1.4.3"
bytes = "1.0.1"
chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] }
clap = { version = "4.0", features = ["string"] }
close_fds = "0.3.2"
const_format = "0.2.21"
@@ -36,23 +35,22 @@ nix = "0.25"
num-traits = "0.2.15"
once_cell = "1.13.0"
pin-project-lite = "0.2.7"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
rand = "0.8.3"
regex = "1.4.5"
rstar = "0.9.3"
scopeguard = "1.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_json = { version = "1.0", features = ["raw_value"] }
serde_with = "2.0"
signal-hook = "0.3.10"
svg_fmt = "0.4.1"
tar = "0.4.33"
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
thiserror = "1.0"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
toml_edit = { version = "0.14", features = ["easy"] }
tracing = "0.1.36"
@@ -69,6 +67,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" }
tenant_size_model = { path = "../libs/tenant_size_model" }
utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
[dev-dependencies]
criterion = "0.4"

View File

@@ -1,8 +1,7 @@
use anyhow::Result;
use pageserver::repository::Key;
use pageserver::tenant::filename::{DeltaFileName, ImageFileName};
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::ValueReconstructState;
use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
@@ -163,7 +162,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
c.bench_function("captest_uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
layer_map.search(q.0, q.1).unwrap();
layer_map.search(q.0, q.1);
}
});
});
@@ -192,7 +191,7 @@ fn bench_from_real_project(c: &mut Criterion) {
c.bench_function("real_map_uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
layer_map.search(q.0, q.1).unwrap();
layer_map.search(q.0, q.1);
}
});
});
@@ -238,7 +237,7 @@ fn bench_sequential(c: &mut Criterion) {
// Run the search queries
b.iter(|| {
for q in queries.clone().into_iter() {
layer_map.search(q.0, q.1).unwrap();
layer_map.search(q.0, q.1);
}
});
});

View File

@@ -84,7 +84,7 @@ fn add_multithreaded_walredo_requesters(
barrier.wait();
execute_all(input, &*manager).unwrap();
execute_all(input, &manager).unwrap();
barrier.wait();
}

View File

@@ -10,19 +10,24 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{anyhow, bail, ensure, Context, Result};
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use itertools::Itertools;
use std::fmt::Write as FmtWrite;
use std::io;
use std::io::Write;
use std::sync::Arc;
use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tokio::io;
use tokio::io::AsyncWrite;
use tracing::*;
use crate::tenant::Timeline;
/// NB: This relies on a modified version of tokio_tar that does *not* write the
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
/// without explicitly calling 'finish' or 'into_inner'!
///
/// See https://github.com/neondatabase/tokio-tar/pull/1
///
use tokio_tar::{Builder, EntryType, Header};
use crate::tenant::{with_ondemand_download, Timeline};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -33,116 +38,130 @@ use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;
/// Create basebackup with non-rel data in it.
/// Only include relational data if 'full_backup' is true.
///
/// Currently we use empty 'req_lsn' in two cases:
/// * During the basebackup right after timeline creation
/// * When working without safekeepers. In this situation it is important to match the lsn
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
/// to start the replication.
pub async fn send_basebackup_tarball<'a, W>(
write: &'a mut W,
timeline: &'a Timeline,
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
) -> anyhow::Result<()>
where
W: AsyncWrite + Send + Sync + Unpin,
{
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
// "zenith.signal" file, so that postgres can read it during startup.
//
// We don't keep full history of record boundaries in the page server,
// however, only the predecessor of the latest record on each
// timeline. So we can only provide prev_record_lsn when you take a
// base backup at the end of the timeline, i.e. at last_record_lsn.
// Even at the end of the timeline, we sometimes don't have a valid
// prev_lsn value; that happens if the timeline was just branched from
// an old LSN and it doesn't have any WAL of its own yet. We will set
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. The caller should've
// already checked that it's a valid LSN.
// If the requested point is the end of the timeline, we can
// provide prev_lsn. (get_last_record_rlsn() might return it as
// zero, though, if no WAL has been generated on this timeline
// yet.)
let end_of_timeline = timeline.get_last_record_rlsn();
if req_lsn == end_of_timeline.last {
(end_of_timeline.prev, req_lsn)
} else {
(Lsn(0), req_lsn)
}
} else {
// Backup was requested at end of the timeline.
let end_of_timeline = timeline.get_last_record_rlsn();
(end_of_timeline.prev, end_of_timeline.last)
};
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn);
}
provided_prev_lsn
} else {
backup_prev
};
info!(
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
backup_lsn, prev_lsn, full_backup
);
let basebackup = Basebackup {
ar: Builder::new_non_terminated(write),
timeline,
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
full_backup,
};
basebackup
.send_tarball()
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
.await
}
/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// used for constructing tarball.
pub struct Basebackup<'a, W>
struct Basebackup<'a, W>
where
W: Write,
W: AsyncWrite + Send + Sync + Unpin,
{
ar: Builder<AbortableWrite<W>>,
timeline: &'a Arc<Timeline>,
pub lsn: Lsn,
ar: Builder<&'a mut W>,
timeline: &'a Timeline,
lsn: Lsn,
prev_record_lsn: Lsn,
full_backup: bool,
finished: bool,
}
// Create basebackup with non-rel data in it.
// Only include relational data if 'full_backup' is true.
//
// Currently we use empty lsn in two cases:
// * During the basebackup right after timeline creation
// * When working without safekeepers. In this situation it is important to match the lsn
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
// to start the replication.
impl<'a, W> Basebackup<'a, W>
where
W: Write,
W: AsyncWrite + Send + Sync + Unpin,
{
pub fn new(
write: W,
timeline: &'a Arc<Timeline>,
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
) -> Result<Basebackup<'a, W>> {
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
// "zenith.signal" file, so that postgres can read it during startup.
//
// We don't keep full history of record boundaries in the page server,
// however, only the predecessor of the latest record on each
// timeline. So we can only provide prev_record_lsn when you take a
// base backup at the end of the timeline, i.e. at last_record_lsn.
// Even at the end of the timeline, we sometimes don't have a valid
// prev_lsn value; that happens if the timeline was just branched from
// an old LSN and it doesn't have any WAL of its own yet. We will set
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. The caller should've
// already checked that it's a valid LSN.
// If the requested point is the end of the timeline, we can
// provide prev_lsn. (get_last_record_rlsn() might return it as
// zero, though, if no WAL has been generated on this timeline
// yet.)
let end_of_timeline = timeline.get_last_record_rlsn();
if req_lsn == end_of_timeline.last {
(end_of_timeline.prev, req_lsn)
} else {
(Lsn(0), req_lsn)
}
} else {
// Backup was requested at end of the timeline.
let end_of_timeline = timeline.get_last_record_rlsn();
(end_of_timeline.prev, end_of_timeline.last)
};
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn)
}
provided_prev_lsn
} else {
backup_prev
};
info!(
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
backup_lsn, prev_lsn, full_backup
);
Ok(Basebackup {
ar: Builder::new(AbortableWrite::new(write)),
timeline,
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
full_backup,
finished: false,
})
}
pub fn send_tarball(mut self) -> anyhow::Result<()> {
async fn send_tarball(mut self) -> anyhow::Result<()> {
// TODO include checksum
// Create pgdata subdirs structure
for dir in PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(*dir)?;
self.ar.append(&header, &mut io::empty())?;
let header = new_tar_header_dir(dir)?;
self.ar
.append(&header, &mut io::empty())
.await
.context("could not add directory to basebackup tarball")?;
}
// Send empty config files.
// Send config files.
for filepath in PGDATA_SPECIAL_FILES.iter() {
if *filepath == "pg_hba.conf" {
let data = PG_HBA.as_bytes();
let header = new_tar_header(filepath, data.len() as u64)?;
self.ar.append(&header, data)?;
self.ar
.append(&header, data)
.await
.context("could not add config file to basebackup tarball")?;
} else {
let header = new_tar_header(filepath, 0)?;
self.ar.append(&header, &mut io::empty())?;
self.ar
.append(&header, &mut io::empty())
.await
.context("could not add config file to basebackup tarball")?;
}
}
@@ -152,24 +171,31 @@ where
SlruKind::MultiXactOffsets,
SlruKind::MultiXactMembers,
] {
for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
self.add_slru_segment(kind, segno)?;
for segno in
with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await?
{
self.add_slru_segment(kind, segno).await?;
}
}
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
for ((spcnode, dbnode), has_relmap_file) in
with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
// Gather and send relational files in each database if full backup is requested.
if self.full_backup {
for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
self.add_rel(rel)?;
for rel in
with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
.await?
{
self.add_rel(rel).await?;
}
}
}
for xid in self.timeline.list_twophase_files(self.lsn)? {
self.add_twophase_file(xid)?;
for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? {
self.add_twophase_file(xid).await?;
}
fail_point!("basebackup-before-control-file", |_| {
@@ -177,42 +203,46 @@ where
});
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file()?;
self.ar.finish()?;
self.finished = true;
self.add_pgcontrol_file().await?;
self.ar.finish().await?;
debug!("all tarred up!");
Ok(())
}
fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?;
// Function that adds relation segment data to archive
let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
let file_name = tag.to_segfile_name(segment_index as u32);
let header = new_tar_header(&file_name, data.len() as u64)?;
self.ar.append(&header, data.as_slice())?;
Ok(())
};
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
let nblocks =
with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
add_file(0, &vec![])?;
let file_name = tag.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar.append(&header, &mut io::empty()).await?;
return Ok(());
}
// Add a file for each chunk of blocks (aka segment)
let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize);
for (seg, blocks) in chunks.into_iter().enumerate() {
let mut startblk = 0;
let mut seg = 0;
while startblk < nblocks {
let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
let mut segment_data: Vec<u8> = vec![];
for blknum in blocks {
let img = self
.timeline
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)?;
for blknum in startblk..endblk {
let img = with_ondemand_download(|| {
self.timeline
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
})
.await?;
segment_data.extend_from_slice(&img[..]);
}
add_file(seg, &segment_data)?;
let file_name = tag.to_segfile_name(seg as u32);
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
self.ar.append(&header, segment_data.as_slice()).await?;
seg += 1;
startblk = endblk;
}
Ok(())
@@ -221,14 +251,18 @@ where
//
// Generate SLRU segment files from repository.
//
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks =
with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn))
.await?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
for blknum in 0..nblocks {
let img = self
.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
let img = with_ondemand_download(|| {
self.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
})
.await?;
if slru == SlruKind::Clog {
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -241,7 +275,7 @@ where
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
self.ar.append(&header, slru_buf.as_slice())?;
self.ar.append(&header, slru_buf.as_slice()).await?;
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
Ok(())
@@ -253,14 +287,16 @@ where
// Each directory contains a PG_VERSION file, and the default database
// directories also contain pg_filenode.map files.
//
fn add_dbdir(
async fn add_dbdir(
&mut self,
spcnode: u32,
dbnode: u32,
has_relmap_file: bool,
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
let img =
with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn))
.await?;
ensure!(img.len() == 512);
Some(img)
} else {
@@ -270,14 +306,14 @@ where
if spcnode == GLOBALTABLESPACE_OID {
let pg_version_str = self.timeline.pg_version.to_string();
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes())?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
info!("timeline.pg_version {}", self.timeline.pg_version);
if let Some(img) = relmap_img {
// filenode map for global tablespace
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
self.ar.append(&header, &img[..])?;
self.ar.append(&header, &img[..]).await?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -293,9 +329,8 @@ where
// XLOG_TBLSPC_DROP records. But we probably should just
// throw an error on CREATE TABLESPACE in the first place.
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, self.lsn)?
&& with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
.await?
.is_empty()
{
return Ok(());
@@ -306,18 +341,18 @@ where
// Append dir path for each database
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar.append(&header, &mut io::empty())?;
self.ar.append(&header, &mut io::empty()).await?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let pg_version_str = self.timeline.pg_version.to_string();
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes())?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar.append(&header, &img[..])?;
self.ar.append(&header, &img[..]).await?;
}
};
Ok(())
@@ -326,8 +361,8 @@ where
//
// Extract twophase state files
//
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = self.timeline.get_twophase_file(xid, self.lsn)?;
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -335,7 +370,7 @@ where
buf.put_u32_le(crc);
let path = format!("pg_twophase/{:>08X}", xid);
let header = new_tar_header(&path, buf.len() as u64)?;
self.ar.append(&header, &buf[..])?;
self.ar.append(&header, &buf[..]).await?;
Ok(())
}
@@ -344,7 +379,7 @@ where
// Add generated pg_control file and bootstrap WAL segment.
// Also send zenith.signal file with extra bootstrap data.
//
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
@@ -356,18 +391,18 @@ where
} else {
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
}
self.ar.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)?;
self.ar
.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)
.await?;
let checkpoint_bytes = self
.timeline
.get_checkpoint(self.lsn)
let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn))
.await
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = self
.timeline
.get_control_file(self.lsn)
let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn))
.await
.context("failed get control bytes")?;
let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
@@ -379,7 +414,7 @@ where
//send pg_control
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
self.ar.append(&header, &pg_control_bytes[..])?;
self.ar.append(&header, &pg_control_bytes[..]).await?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -391,24 +426,11 @@ where
postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..])?;
self.ar.append(&header, &wal_seg[..]).await?;
Ok(())
}
}
impl<'a, W> Drop for Basebackup<'a, W>
where
W: Write,
{
/// If the basebackup was not finished, prevent the Archive::drop() from
/// writing the end-of-archive marker.
fn drop(&mut self) {
if !self.finished {
self.ar.get_mut().abort();
}
}
}
//
// Create new tarball entry header
//
@@ -444,49 +466,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
header.set_cksum();
Ok(header)
}
/// A wrapper that passes through all data to the underlying Write,
/// until abort() is called.
///
/// tar::Builder has an annoying habit of finishing the archive with
/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
/// even if an error occurs and we don't finish building the archive.
/// We'd rather abort writing the tarball immediately than construct
/// a seemingly valid but incomplete archive. This wrapper allows us
/// to swallow the end-of-archive marker that Builder::drop() emits,
/// without writing it to the underlying sink.
///
struct AbortableWrite<W> {
w: W,
aborted: bool,
}
impl<W> AbortableWrite<W> {
pub fn new(w: W) -> Self {
AbortableWrite { w, aborted: false }
}
pub fn abort(&mut self) {
self.aborted = true;
}
}
impl<W> Write for AbortableWrite<W>
where
W: Write,
{
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
if self.aborted {
Ok(data.len())
} else {
self.w.write(data)
}
}
fn flush(&mut self) -> io::Result<()> {
if self.aborted {
Ok(())
} else {
self.w.flush()
}
}
}

View File

@@ -7,19 +7,20 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use remote_storage::GenericRemoteStorage;
use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, profiling, task_mgr,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant_mgr, virtual_file,
tenant::mgr,
virtual_file,
};
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
logging,
@@ -39,8 +40,6 @@ const FEATURES: &[&str] = &[
"testing",
#[cfg(feature = "fail/failpoints")]
"fail/failpoints",
#[cfg(feature = "profiling")]
"profiling",
];
fn version() -> String {
@@ -127,7 +126,7 @@ fn initialize_config(
);
}
// Supplement the CLI arguments with the config file
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| {
let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
format!(
"Failed to read pageserver config at '{}'",
cfg_file_path.display()
@@ -181,7 +180,7 @@ fn initialize_config(
if update_config {
info!("Writing pageserver config to '{}'", cfg_file_path.display());
std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
format!(
"Failed to write pageserver config to '{}'",
cfg_file_path.display()
@@ -201,8 +200,12 @@ fn initialize_config(
}
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// Initialize logging
logging::init(conf.log_format)?;
// Print version to the log, and expose it as a prometheus metric too.
info!("version: {}", version());
set_build_info_metric(GIT_VERSION);
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
@@ -218,40 +221,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
)
}
// Create and lock PID file. This ensures that there cannot be more than one
// pageserver process running at the same time.
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file =
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
info!("Claimed pid file at {lock_file_path:?}");
// ensure that the lock file is held even if the main thread of the process is panics
// we need to release the lock file only when the current process is gone
// Ensure that the lock file is held even if the main thread of the process panics.
// We need to release the lock file only when the process exits.
std::mem::forget(lock_file);
// TODO: Check that it looks like a valid repository before going further
// Bind the HTTP and libpq ports early, so that if they are in use by some other
// process, we error out early.
let http_addr = &conf.listen_http_addr;
info!("Starting pageserver http handler on {http_addr}");
let http_listener = tcp_listener::bind(http_addr)?;
// bind sockets before daemonizing so we report errors early and do not return until we are listening
info!(
"Starting pageserver http handler on {}",
conf.listen_http_addr
);
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
info!(
"Starting pageserver pg protocol handler on {}",
conf.listen_pg_addr
);
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
let pg_addr = &conf.listen_pg_addr;
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
// Install signal handlers
let signals = signals::install_shutdown_handlers()?;
// start profiler (if enabled)
let profiler_guard = profiling::init_profiler(conf);
// Launch broker client
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
// initialize authentication for incoming connections
// Initialize authentication for incoming connections
let auth = match &conf.auth_type {
AuthType::Trust | AuthType::MD5 => None,
AuthType::Trust => None,
AuthType::NeonJWT => {
// unwrap is ok because check is performed when creating config, so path is set and file exists
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
@@ -260,54 +259,54 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
};
info!("Using auth: {:#?}", conf.auth_type);
match var("ZENITH_AUTH_TOKEN") {
Ok(v) => {
// TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
(old, Ok(v)) => {
info!("Loaded JWT token for authentication with Safekeeper");
if let Ok(v_old) = old {
warn!(
"JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
);
if v_old != v {
warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
}
}
pageserver::config::SAFEKEEPER_AUTH_TOKEN
.set(Arc::new(v))
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
}
Err(VarError::NotPresent) => {
(Ok(v), _) => {
info!("Loaded JWT token for authentication with Safekeeper");
warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
pageserver::config::SAFEKEEPER_AUTH_TOKEN
.set(Arc::new(v))
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
}
(_, Err(VarError::NotPresent)) => {
info!("No JWT token for authentication with Safekeeper detected");
}
Err(e) => {
(_, Err(e)) => {
return Err(e).with_context(|| {
"Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable"
"Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
})
}
};
let remote_storage = conf
.remote_storage_config
.as_ref()
.map(GenericRemoteStorage::from_config)
.transpose()
.context("Failed to init generic remote storage")?;
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
let (init_result_sender, init_result_receiver) =
std::sync::mpsc::channel::<anyhow::Result<()>>();
let storage_for_spawn = remote_storage.clone();
let _handler = BACKGROUND_RUNTIME.spawn(async move {
let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await;
init_result_sender.send(result)
});
match init_result_receiver.recv() {
Ok(init_result) => init_result.context("Failed to init tenant_mgr")?,
Err(_sender_dropped_err) => {
anyhow::bail!("Failed to init tenant_mgr: no init status was returned");
}
}
// Scan the local 'tenants/' directory and start loading the tenants
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
// Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
// bind before launching separate thread so the error reported before startup exits
// Create a Service from the router above to handle incoming requests.
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
{
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
let router = http::make_router(conf, auth.clone(), remote_storage)?;
let service =
utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
let router = http::make_router(conf, auth.clone(), remote_storage)?
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?
.serve(service)
.with_graceful_shutdown(task_mgr::shutdown_watcher());
@@ -324,10 +323,31 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
Ok(())
},
);
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
TaskKind::MetricsCollection,
None,
None,
"consumption metrics collection",
true,
async move {
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,
conf.id,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
},
);
}
}
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection.
// for each connection. We created the listener earlier already.
task_mgr::spawn(
COMPUTE_REQUEST_RUNTIME.handle(),
TaskKind::LibpqEndpointListener,
@@ -340,8 +360,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
},
);
set_build_info_metric(GIT_VERSION);
// All started up! Now just sit and wait for shutdown signal.
signals.handle(|signal| match signal {
Signal::Quit => {
@@ -349,7 +367,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
"Got {}. Terminating in immediate shutdown mode",
signal.name()
);
profiling::exit_profiler(conf, &profiler_guard);
std::process::exit(111);
}
@@ -358,13 +375,42 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
"Got {}. Terminating gracefully in fast shutdown mode",
signal.name()
);
profiling::exit_profiler(conf, &profiler_guard);
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!()
}
})
}
fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<Option<GenericRemoteStorage>> {
let config = if let Some(config) = &conf.remote_storage_config {
config
} else {
// No remote storage configured.
return Ok(None);
};
// Create the client
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
// If `test_remote_failures` is non-zero, wrap the client with a
// wrapper that simulates failures.
if conf.test_remote_failures > 0 {
if !cfg!(feature = "testing") {
anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
}
info!(
"Simulating remote failures for first {} attempts of each op",
conf.test_remote_failures
);
remote_storage =
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
}
Ok(Some(remote_storage))
}
fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")

View File

@@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> {
}
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
println!("{control_file:?}");
let control_file_initdb = Lsn(control_file.checkPoint);
println!(
@@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> {
}
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
let metadata_bytes = std::fs::read(&path)?;
let metadata_bytes = std::fs::read(path)?;
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
@@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
if update_meta {
let metadata_bytes = meta.to_bytes()?;
std::fs::write(&path, &metadata_bytes)?;
std::fs::write(path, metadata_bytes)?;
}
Ok(())

View File

@@ -12,6 +12,7 @@ use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use once_cell::sync::OnceCell;
use reqwest::Url;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
use std::str::FromStr;
@@ -26,14 +27,15 @@ use utils::{
postgres_backend::AuthType,
};
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::tenant_config::{TenantConf, TenantConfOpt};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
};
pub mod defaults {
use crate::tenant_config::defaults::*;
use crate::tenant::config::defaults::*;
use const_format::formatcp;
pub use pageserver_api::{
@@ -55,6 +57,8 @@ pub mod defaults {
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
///
/// Default built-in configuration file.
///
@@ -78,6 +82,8 @@ pub mod defaults {
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -132,16 +138,22 @@ pub struct PageServerConf {
pub auth_validation_public_key_path: Option<PathBuf>,
pub remote_storage_config: Option<RemoteStorageConfig>,
pub profiling: ProfilingConfig,
pub default_tenant_conf: TenantConf,
/// Storage broker endpoints to connect to.
pub broker_endpoint: Uri,
pub broker_keepalive_interval: Duration,
pub log_format: LogFormat,
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
// How often to collect metrics and send them to the metrics endpoint.
pub metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub test_remote_failures: u64,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -152,25 +164,6 @@ pub struct PageServerConf {
/// startup code to the connection code through a dozen layers.
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ProfilingConfig {
Disabled,
PageRequests,
}
impl FromStr for ProfilingConfig {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
let result = match s {
"disabled" => ProfilingConfig::Disabled,
"page_requests" => ProfilingConfig::PageRequests,
_ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
};
Ok(result)
}
}
// use dedicated enum for builder to better indicate the intention
// and avoid possible confusion with nested options
pub enum BuilderValue<T> {
@@ -213,12 +206,17 @@ struct PageServerConfigBuilder {
id: BuilderValue<NodeId>,
profiling: BuilderValue<ProfilingConfig>,
broker_endpoint: BuilderValue<Uri>,
broker_keepalive_interval: BuilderValue<Duration>,
log_format: BuilderValue<LogFormat>,
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
test_remote_failures: BuilderValue<u64>,
}
impl Default for PageServerConfigBuilder {
@@ -243,13 +241,23 @@ impl Default for PageServerConfigBuilder {
auth_validation_public_key_path: Set(None),
remote_storage_config: Set(None),
id: NotSet,
profiling: Set(ProfilingConfig::Disabled),
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
.parse()
.expect("failed to parse default broker endpoint")),
broker_keepalive_interval: Set(humantime::parse_duration(
storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
)
.expect("cannot parse default keepalive interval")),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
metric_collection_interval: Set(humantime::parse_duration(
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default metric collection interval")),
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
test_remote_failures: Set(0),
}
}
}
@@ -310,12 +318,12 @@ impl PageServerConfigBuilder {
self.broker_endpoint = BuilderValue::Set(broker_endpoint)
}
pub fn id(&mut self, node_id: NodeId) {
self.id = BuilderValue::Set(node_id)
pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
}
pub fn profiling(&mut self, profiling: ProfilingConfig) {
self.profiling = BuilderValue::Set(profiling)
pub fn id(&mut self, node_id: NodeId) {
self.id = BuilderValue::Set(node_id)
}
pub fn log_format(&mut self, log_format: LogFormat) {
@@ -326,6 +334,18 @@ impl PageServerConfigBuilder {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
}
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
pub fn test_remote_failures(&mut self, fail_first: u64) {
self.test_remote_failures = BuilderValue::Set(fail_first);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
Ok(PageServerConf {
listen_pg_addr: self
@@ -359,18 +379,29 @@ impl PageServerConfigBuilder {
.remote_storage_config
.ok_or(anyhow!("missing remote_storage_config"))?,
id: self.id.ok_or(anyhow!("missing id"))?,
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
broker_endpoint: self
.broker_endpoint
.ok_or(anyhow!("No broker endpoints provided"))?,
broker_keepalive_interval: self
.broker_keepalive_interval
.ok_or(anyhow!("No broker keepalive interval provided"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_size_logical_size_queries: self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?,
metric_collection_interval: self
.metric_collection_interval
.ok_or(anyhow!("missing metric_collection_interval"))?,
metric_collection_endpoint: self
.metric_collection_endpoint
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
test_remote_failures: self
.test_remote_failures
.ok_or(anyhow!("missing test_remote_failuers"))?,
})
}
}
@@ -530,8 +561,8 @@ impl PageServerConf {
t_conf = Self::parse_toml_tenant_conf(item)?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
@@ -541,6 +572,13 @@ impl PageServerConf {
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
ConfigurableSemaphore::new(permits)
}),
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
"metric_collection_endpoint" => {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
},
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -656,11 +694,14 @@ impl PageServerConf {
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::dummy_conf(),
default_tenant_conf: TenantConf::default(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5000),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
test_remote_failures: 0,
}
}
}
@@ -791,6 +832,8 @@ max_file_descriptors = 333
initial_superuser_name = 'zzzz'
id = 10
metric_collection_interval = '222 s'
metric_collection_endpoint = 'http://localhost:80/metrics'
log_format = 'json'
"#;
@@ -826,11 +869,18 @@ log_format = 'json'
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: humantime::parse_duration(
storage_broker::DEFAULT_KEEPALIVE_INTERVAL
)?,
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
test_remote_failures: 0,
},
"Correct defaults should be used when no config values are provided"
);
@@ -869,11 +919,14 @@ log_format = 'json'
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5),
log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(222),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
test_remote_failures: 0,
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -0,0 +1,324 @@
//!
//! Periodically collect consumption metrics for all active tenants
//! and push them to a HTTP endpoint.
//! Cache metrics to send only the updated ones.
//!
use anyhow;
use tracing::*;
use utils::id::NodeId;
use utils::id::TimelineId;
use crate::task_mgr;
use crate::tenant::mgr;
use pageserver_api::models::TenantState;
use utils::id::TenantId;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::fmt;
use std::str::FromStr;
use std::time::Duration;
use chrono::{DateTime, Utc};
use rand::Rng;
use reqwest::Url;
/// ConsumptionMetric struct that defines the format for one metric entry
/// i.e.
///
/// ```json
/// {
/// "metric": "remote_storage_size",
/// "type": "absolute",
/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
/// "time": "2022-12-28T11:07:19.317310284Z",
/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
/// "value": 12345454,
/// }
/// ```
#[serde_as]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct ConsumptionMetric {
pub metric: ConsumptionMetricKind,
#[serde(rename = "type")]
pub metric_type: &'static str,
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_id: Option<TimelineId>,
pub time: DateTime<Utc>,
pub idempotency_key: String,
pub value: u64,
}
impl ConsumptionMetric {
pub fn new_absolute<R: Rng + ?Sized>(
metric: ConsumptionMetricKind,
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
value: u64,
node_id: NodeId,
rng: &mut R,
) -> Self {
Self {
metric,
metric_type: "absolute",
tenant_id,
timeline_id,
time: Utc::now(),
// key that allows metric collector to distinguish unique events
idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
value,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConsumptionMetricKind {
/// Amount of WAL produced , by a timeline, i.e. last_record_lsn
/// This is an absolute, per-timeline metric.
WrittenSize,
/// Size of all tenant branches including WAL
/// This is an absolute, per-tenant metric.
/// This is the same metric that tenant/tenant_id/size endpoint returns.
SyntheticStorageSize,
/// Size of all the layer files in the tenant's directory on disk on the pageserver.
/// This is an absolute, per-tenant metric.
/// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
ResidentSize,
/// Size of the remote storage (S3) directory.
/// This is an absolute, per-tenant metric.
RemoteStorageSize,
/// Logical size of the data in the timeline
/// This is an absolute, per-timeline metric
TimelineLogicalSize,
}
impl FromStr for ConsumptionMetricKind {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"written_size" => Ok(Self::WrittenSize),
"synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
"resident_size" => Ok(Self::ResidentSize),
"remote_storage_size" => Ok(Self::RemoteStorageSize),
"timeline_logical_size" => Ok(Self::TimelineLogicalSize),
_ => anyhow::bail!("invalid value \"{s}\" for metric type"),
}
}
}
impl fmt::Display for ConsumptionMetricKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(match self {
ConsumptionMetricKind::WrittenSize => "written_size",
ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
ConsumptionMetricKind::ResidentSize => "resident_size",
ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
})
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ConsumptionMetricsKey {
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
metric: ConsumptionMetricKind,
}
#[derive(serde::Serialize)]
struct EventChunk<'a> {
events: &'a [ConsumptionMetric],
}
/// Main thread that serves metrics collection
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
node_id: NodeId,
) -> anyhow::Result<()> {
let mut ticker = tokio::time::interval(metric_collection_interval);
info!("starting collect_metrics");
// define client here to reuse it for all requests
let client = reqwest::Client::new();
let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
loop {
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("collect_metrics received cancellation request");
return Ok(());
},
_ = ticker.tick() => {
collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?;
}
}
}
}
/// One iteration of metrics collection
///
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
/// Cache metrics to avoid sending the same metrics multiple times.
pub async fn collect_metrics_task(
client: &reqwest::Client,
cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
) -> anyhow::Result<()> {
let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
trace!(
"starting collect_metrics_task. metric_collection_endpoint: {}",
metric_collection_endpoint
);
// get list of tenants
let tenants = mgr::list_tenants().await;
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
let tenant = mgr::get_tenant(tenant_id, true).await?;
let mut tenant_resident_size = 0;
// iterate through list of timelines in tenant
for timeline in tenant.list_timelines().iter() {
// collect per-timeline metrics only for active timelines
if timeline.is_active() {
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: ConsumptionMetricKind::WrittenSize,
},
timeline_written_size,
));
let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
// Only send timeline logical size when it is fully calculated.
if is_exact {
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: ConsumptionMetricKind::TimelineLogicalSize,
},
timeline_logical_size,
));
}
}
let timeline_resident_size = timeline.get_resident_physical_size();
tenant_resident_size += timeline_resident_size;
}
let tenant_remote_size = tenant.get_remote_size().await?;
debug!(
"collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
);
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: ConsumptionMetricKind::ResidentSize,
},
tenant_resident_size,
));
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: ConsumptionMetricKind::RemoteStorageSize,
},
tenant_remote_size,
));
// TODO add SyntheticStorageSize metric
}
// Filter metrics
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
Some(val) => val != curr_val,
None => true,
});
if current_metrics.is_empty() {
trace!("no new metrics to send");
return Ok(());
}
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
const CHUNK_SIZE: usize = 1000;
let chunks = current_metrics.chunks(CHUNK_SIZE);
let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
for chunk in chunks {
chunk_to_send.clear();
// this code block is needed to convince compiler
// that rng is not reused aroung await point
{
// enrich metrics with timestamp and metric_kind before sending
let mut rng = rand::thread_rng();
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
ConsumptionMetric::new_absolute(
curr_key.metric,
curr_key.tenant_id,
curr_key.timeline_id,
*curr_val,
node_id,
&mut rng,
)
}));
}
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
events: &chunk_to_send,
})
.expect("ConsumptionMetric should not fail serialization");
let res = client
.post(metric_collection_endpoint.clone())
.json(&chunk_json)
.send()
.await;
match res {
Ok(res) => {
if res.status().is_success() {
// update cached metrics after they were sent successfully
for (curr_key, curr_val) in chunk.iter() {
cached_metrics.insert(curr_key.clone(), *curr_val);
}
} else {
error!("metrics endpoint refused the sent metrics: {:?}", res);
}
}
Err(err) => {
error!("failed to send metrics: {:?}", err);
}
}
}
Ok(())
}

View File

@@ -77,16 +77,6 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
- name: include-non-incremental-physical-size
in: query
schema:
type: string
description: Controls calculation of current_physical_size_non_incremental
get:
description: Get timelines for tenant
responses:
@@ -139,17 +129,6 @@ paths:
format: hex
get:
description: Get info about the timeline
parameters:
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
- name: include-non-incremental-physical-size
in: query
schema:
type: string
description: Controls calculation of current_physical_size_non_incremental
responses:
"200":
description: TimelineInfo
@@ -779,10 +758,6 @@ components:
type: integer
current_physical_size:
type: integer
current_logical_size_non_incremental:
type: integer
current_physical_size_non_incremental:
type: integer
wal_source_connstr:
type: string
last_received_msg_lsn:
@@ -795,37 +770,6 @@ components:
latest_gc_cutoff_lsn:
type: string
format: hex
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,
# when the control plane has been updated to look at the above fields
# directly.
local:
$ref: "#/components/schemas/LocalTimelineInfo"
remote:
$ref: "#/components/schemas/RemoteTimelineInfo"
LocalTimelineInfo:
type: object
properties:
ancestor_timeline_id:
type: string
format: hex
ancestor_lsn:
type: string
format: hex
current_logical_size:
type: integer
current_physical_size:
type: integer
RemoteTimelineInfo:
type: object
required:
- remote_consistent_lsn
properties:
remote_consistent_lsn:
type: string
format: hex
Error:
type: object
required:

View File

@@ -4,16 +4,17 @@ use anyhow::{anyhow, Context, Result};
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use tracing::*;
use super::models::{
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineInfo,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::tenant::Timeline;
use crate::tenant_config::TenantConfOpt;
use crate::{config::PageServerConf, tenant_mgr};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{with_ondemand_download, Timeline};
use crate::{config::PageServerConf, tenant::mgr};
use utils::{
auth::JwtAuth,
http::{
@@ -30,8 +31,6 @@ use utils::{
// Imports only used for testing APIs
#[cfg(feature = "testing")]
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
#[cfg(feature = "testing")]
use crate::CheckpointConfig;
struct State {
conf: &'static PageServerConf,
@@ -79,19 +78,23 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
}
// Helper function to construct a TimelineInfo struct for a timeline
fn build_timeline_info(
async fn build_timeline_info(
timeline: &Arc<Timeline>,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<TimelineInfo> {
let mut info = build_timeline_info_common(timeline)?;
if include_non_incremental_logical_size {
info.current_logical_size_non_incremental =
Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
}
if include_non_incremental_physical_size {
info.current_physical_size_non_incremental =
Some(timeline.get_physical_size_non_incremental()?)
// XXX we should be using spawn_ondemand_logical_size_calculation here.
// Otherwise, if someone deletes the timeline / detaches the tenant while
// we're executing this function, we will outlive the timeline on-disk state.
info.current_logical_size_non_incremental = Some(
timeline
.get_current_logical_size_non_incremental(
info.last_record_lsn,
CancellationToken::new(),
)
.await?,
);
}
Ok(info)
}
@@ -117,13 +120,13 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
lsn @ Lsn(_) => Some(lsn),
};
let current_logical_size = match timeline.get_current_logical_size() {
Ok(size) => Some(size),
Ok((size, _)) => Some(size),
Err(err) => {
error!("Timeline info creation failed to get current logical size: {err:?}");
None
}
};
let current_physical_size = Some(timeline.get_physical_size());
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
let state = timeline.current_state();
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
@@ -140,25 +143,13 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
current_logical_size,
current_physical_size,
current_logical_size_non_incremental: None,
current_physical_size_non_incremental: None,
timeline_dir_layer_file_size_sum: None,
wal_source_connstr,
last_received_msg_lsn,
last_received_msg_ts,
pg_version: timeline.pg_version,
state,
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
// with the control plane.
local: LocalTimelineInfo {
ancestor_timeline_id,
ancestor_lsn,
current_logical_size,
current_physical_size,
},
remote: RemoteTimelineInfo {
remote_consistent_lsn: Some(remote_consistent_lsn),
},
};
Ok(info)
}
@@ -179,7 +170,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.new_timeline_id
.unwrap_or_else(TimelineId::generate);
let tenant = tenant_mgr::get_tenant(tenant_id, true)
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
match tenant.create_timeline(
@@ -205,29 +196,26 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let response_data = async {
let tenant = tenant_mgr::get_tenant(tenant_id, true)
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
.map_err(ApiError::InternalServerError)?;
let timeline_info =
build_timeline_info(&timeline, include_non_incremental_logical_size)
.await
.context(
"Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
)
.map_err(ApiError::InternalServerError)?;
response_data.push(timeline_info);
}
Ok(response_data)
}
.instrument(info_span!("timeline_list", tenant = %tenant_id))
@@ -271,12 +259,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let timeline_info = async {
let tenant = tenant_mgr::get_tenant(tenant_id, true)
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
@@ -284,13 +270,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
.get_timeline(timeline_id, false)
.map_err(ApiError::NotFound)?;
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
.context("Failed to get local timeline info: {e:#}")
.map_err(ApiError::InternalServerError)?;
let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
.await
.context("Failed to get local timeline info: {e:#}")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(timeline_info)
}
@@ -311,14 +294,15 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
.map_err(ApiError::BadRequest)?;
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let timeline = tenant_mgr::get_tenant(tenant_id, true)
let timeline = mgr::get_tenant(tenant_id, true)
.await
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
.map_err(ApiError::NotFound)?;
let result = match timeline
.find_lsn_for_timestamp(timestamp_pg)
.map_err(ApiError::InternalServerError)?
{
let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
.await
.map_err(ApiError::InternalServerError)?;
let result = match result {
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
@@ -338,7 +322,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
if let Some(remote_storage) = &state.remote_storage {
// FIXME: distinguish between "Tenant already exists" and other errors
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
.await
.map_err(ApiError::InternalServerError)?;
@@ -356,7 +340,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
tenant_mgr::delete_timeline(tenant_id, timeline_id)
mgr::delete_timeline(tenant_id, timeline_id)
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
.await
// FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
@@ -373,7 +357,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
let state = get_state(&request);
let conf = state.conf;
tenant_mgr::detach_tenant(conf, tenant_id)
mgr::detach_tenant(conf, tenant_id)
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
.await
// FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
@@ -388,7 +372,7 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
.instrument(info_span!("load", tenant = %tenant_id))
.await
.map_err(ApiError::InternalServerError)?;
@@ -402,7 +386,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
let state = get_state(&request);
let conf = state.conf;
tenant_mgr::ignore_tenant(conf, tenant_id)
mgr::ignore_tenant(conf, tenant_id)
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
.await
// FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
@@ -415,7 +399,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let response_data = tenant_mgr::list_tenants()
let response_data = mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
.iter()
@@ -435,12 +419,12 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
check_permission(&request, Some(tenant_id))?;
let tenant_info = async {
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
let tenant = mgr::get_tenant(tenant_id, false).await?;
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() {
current_physical_size += timeline.get_physical_size();
current_physical_size += timeline.layer_size_sum().approximate_is_ok();
}
let state = tenant.current_state();
@@ -462,7 +446,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::InternalServerError)?;
@@ -583,7 +567,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
let state = get_state(&request);
let new_tenant = tenant_mgr::create_tenant(
let new_tenant = mgr::create_tenant(
state.conf,
tenant_conf,
target_tenant_id,
@@ -685,7 +669,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
}
let state = get_state(&request);
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
.await
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
@@ -737,7 +721,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -754,17 +738,17 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline
.compact()
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id)
.await
.context("spawn compaction task")
.map_err(ApiError::InternalServerError)?;
let result: anyhow::Result<()> = result_receiver
.await
.context("receive compaction result")
.map_err(ApiError::InternalServerError)?;
result.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
@@ -775,20 +759,63 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)
.freeze_and_flush()
.await
.map_err(ApiError::InternalServerError)?;
timeline
.compact()
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn timeline_download_remote_layers_handler_post(
request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
match timeline.spawn_download_all_remote_layers().await {
Ok(st) => json_response(StatusCode::ACCEPTED, st),
Err(st) => json_response(StatusCode::CONFLICT, st),
}
}
async fn timeline_download_remote_layers_handler_get(
request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
let info = timeline
.get_download_all_remote_layers_task_info()
.context("task never started since last pageserver process start")
.map_err(ApiError::NotFound)?;
json_response(StatusCode::OK, info)
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(
StatusCode::NOT_FOUND,
@@ -873,6 +900,14 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
timeline_download_remote_layers_handler_post,
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
timeline_download_remote_layers_handler_get,
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_delete_handler,

View File

@@ -2,12 +2,13 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! a neon Timeline.
//!
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use futures::StreamExt;
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tracing::*;
use walkdir::WalkDir;
@@ -42,7 +43,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
/// This is currently only used to import a cluster freshly created by initdb.
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir(
pub async fn import_timeline_from_postgres_datadir(
tline: &Timeline,
pgdata_path: &Path,
pgdata_lsn: Lsn,
@@ -65,9 +66,11 @@ pub fn import_timeline_from_postgres_datadir(
let absolute_path = entry.path();
let relative_path = absolute_path.strip_prefix(pgdata_path)?;
let file = File::open(absolute_path)?;
let mut file = tokio::fs::File::open(absolute_path).await?;
let len = metadata.len() as usize;
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
if let Some(control_file) =
import_file(&mut modification, relative_path, &mut file, len).await?
{
pg_control = Some(control_file);
}
modification.flush()?;
@@ -96,18 +99,19 @@ pub fn import_timeline_from_postgres_datadir(
tline,
Lsn(pg_control.checkPointCopy.redo),
pgdata_lsn,
)?;
)
.await?;
Ok(())
}
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_rel<Reader: Read>(
modification: &mut DatadirModification,
async fn import_rel(
modification: &mut DatadirModification<'_>,
path: &Path,
spcoid: Oid,
dboid: Oid,
mut reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
) -> anyhow::Result<()> {
// Does it look like a relation file?
@@ -148,7 +152,7 @@ fn import_rel<Reader: Read>(
}
loop {
let r = reader.read_exact(&mut buf);
let r = reader.read_exact(&mut buf).await;
match r {
Ok(_) => {
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
@@ -181,19 +185,19 @@ fn import_rel<Reader: Read>(
/// Import an SLRU segment file
///
fn import_slru<Reader: Read>(
modification: &mut DatadirModification,
async fn import_slru(
modification: &mut DatadirModification<'_>,
slru: SlruKind,
path: &Path,
mut reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
) -> Result<()> {
trace!("importing slru file {}", path.display());
) -> anyhow::Result<()> {
info!("importing slru file {path:?}");
let mut buf: [u8; 8192] = [0u8; 8192];
let filename = &path
.file_name()
.expect("missing slru filename")
.with_context(|| format!("missing slru filename for path {path:?}"))?
.to_string_lossy();
let segno = u32::from_str_radix(filename, 16)?;
@@ -206,7 +210,7 @@ fn import_slru<Reader: Read>(
let mut rpageno = 0;
loop {
let r = reader.read_exact(&mut buf);
let r = reader.read_exact(&mut buf).await;
match r {
Ok(_) => {
modification.put_slru_page_image(
@@ -237,14 +241,20 @@ fn import_slru<Reader: Read>(
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
async fn import_wal(
walpath: &Path,
tline: &Timeline,
startpoint: Lsn,
endpoint: Lsn,
) -> anyhow::Result<()> {
use std::io::Read;
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
let mut walingest = WalIngest::new(tline, startpoint)?;
let mut walingest = WalIngest::new(tline, startpoint).await?;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
@@ -260,14 +270,15 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
}
// Slurp the WAL file
let mut file = File::open(&path)?;
let mut file = std::fs::File::open(&path)?;
if offset > 0 {
file.seek(SeekFrom::Start(offset as u64))?;
use std::io::Seek;
file.seek(std::io::SeekFrom::Start(offset as u64))?;
}
let nread = file.read_to_end(&mut buf)?;
if nread != WAL_SEGMENT_SIZE - offset as usize {
if nread != WAL_SEGMENT_SIZE - offset {
// Maybe allow this for .partial files?
error!("read only {} bytes from WAL file", nread);
}
@@ -279,7 +290,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.await?;
last_lsn = lsn;
nrecords += 1;
@@ -303,9 +316,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
Ok(())
}
pub fn import_basebackup_from_tar<Reader: Read>(
pub async fn import_basebackup_from_tar(
tline: &Timeline,
reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
) -> Result<()> {
info!("importing base at {base_lsn}");
@@ -315,21 +328,24 @@ pub fn import_basebackup_from_tar<Reader: Read>(
let mut pg_control: Option<ControlFileData> = None;
// Import base
for base_tar_entry in tar::Archive::new(reader).entries()? {
let entry = base_tar_entry?;
let mut entries = Archive::new(reader).entries()?;
while let Some(base_tar_entry) = entries.next().await {
let mut entry = base_tar_entry?;
let header = entry.header();
let len = header.entry_size()? as usize;
let file_path = header.path()?.into_owned();
match header.entry_type() {
tar::EntryType::Regular => {
if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
tokio_tar::EntryType::Regular => {
if let Some(res) =
import_file(&mut modification, file_path.as_ref(), &mut entry, len).await?
{
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush()?;
}
tar::EntryType::Directory => {
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
}
_ => {
@@ -349,9 +365,9 @@ pub fn import_basebackup_from_tar<Reader: Read>(
Ok(())
}
pub fn import_wal_from_tar<Reader: Read>(
pub async fn import_wal_from_tar(
tline: &Timeline,
reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
start_lsn: Lsn,
end_lsn: Lsn,
) -> Result<()> {
@@ -360,20 +376,23 @@ pub fn import_wal_from_tar<Reader: Read>(
let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = start_lsn;
let mut walingest = WalIngest::new(tline, start_lsn)?;
let mut walingest = WalIngest::new(tline, start_lsn).await?;
// Ingest wal until end_lsn
info!("importing wal until {}", end_lsn);
let mut pg_wal_tar = tar::Archive::new(reader);
let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
let mut pg_wal_tar = Archive::new(reader);
let mut pg_wal_entries = pg_wal_tar.entries()?;
while last_lsn <= end_lsn {
let bytes = {
let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
let mut entry = pg_wal_entries
.next()
.await
.ok_or_else(|| anyhow::anyhow!("expected more wal"))??;
let header = entry.header();
let file_path = header.path()?.into_owned();
match header.entry_type() {
tar::EntryType::Regular => {
tokio_tar::EntryType::Regular => {
// FIXME: assume postgresql tli 1 for now
let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
let file_name = file_path
@@ -383,9 +402,9 @@ pub fn import_wal_from_tar<Reader: Read>(
ensure!(expected_filename == file_name);
debug!("processing wal file {:?}", file_path);
read_all_bytes(entry)?
read_all_bytes(&mut entry).await?
}
tar::EntryType::Directory => {
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
continue;
}
@@ -405,7 +424,9 @@ pub fn import_wal_from_tar<Reader: Read>(
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.await?;
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -424,7 +445,7 @@ pub fn import_wal_from_tar<Reader: Read>(
}
// Log any extra unused files
for e in &mut pg_wal_entries_iter {
while let Some(e) = pg_wal_entries.next().await {
let entry = e?;
let header = entry.header();
let file_path = header.path()?.into_owned();
@@ -434,24 +455,30 @@ pub fn import_wal_from_tar<Reader: Read>(
Ok(())
}
fn import_file<Reader: Read>(
modification: &mut DatadirModification,
async fn import_file(
modification: &mut DatadirModification<'_>,
file_path: &Path,
reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
) -> Result<Option<ControlFileData>> {
let file_name = match file_path.file_name() {
Some(name) => name.to_string_lossy(),
None => return Ok(None),
};
if file_name.starts_with('.') {
// tar archives on macOs, created without COPYFILE_DISABLE=1 env var
// will contain "fork files", skip them.
return Ok(None);
}
if file_path.starts_with("global") {
let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
let dbnode = 0;
match file_path
.file_name()
.expect("missing filename")
.to_string_lossy()
.as_ref()
{
match file_name.as_ref() {
"pg_control" => {
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
// Extract the checkpoint record and import it separately.
let pg_control = ControlFileData::decode(&bytes[..])?;
@@ -464,7 +491,7 @@ fn import_file<Reader: Read>(
return Ok(Some(pg_control));
}
"pg_filenode.map" => {
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
modification.put_relmap_file(spcnode, dbnode, bytes)?;
debug!("imported relmap file")
}
@@ -472,7 +499,7 @@ fn import_file<Reader: Read>(
debug!("ignored PG_VERSION file");
}
_ => {
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
debug!("imported rel creation");
}
}
@@ -485,14 +512,9 @@ fn import_file<Reader: Read>(
.to_string_lossy()
.parse()?;
match file_path
.file_name()
.expect("missing base filename")
.to_string_lossy()
.as_ref()
{
match file_name.as_ref() {
"pg_filenode.map" => {
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
modification.put_relmap_file(spcnode, dbnode, bytes)?;
debug!("imported relmap file")
}
@@ -500,40 +522,36 @@ fn import_file<Reader: Read>(
debug!("ignored PG_VERSION file");
}
_ => {
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
debug!("imported rel creation");
}
}
} else if file_path.starts_with("pg_xact") {
let slru = SlruKind::Clog;
import_slru(modification, slru, file_path, reader, len)?;
import_slru(modification, slru, file_path, reader, len).await?;
debug!("imported clog slru");
} else if file_path.starts_with("pg_multixact/offsets") {
let slru = SlruKind::MultiXactOffsets;
import_slru(modification, slru, file_path, reader, len)?;
import_slru(modification, slru, file_path, reader, len).await?;
debug!("imported multixact offsets slru");
} else if file_path.starts_with("pg_multixact/members") {
let slru = SlruKind::MultiXactMembers;
import_slru(modification, slru, file_path, reader, len)?;
import_slru(modification, slru, file_path, reader, len).await?;
debug!("imported multixact members slru");
} else if file_path.starts_with("pg_twophase") {
let file_name = &file_path
.file_name()
.expect("missing twophase filename")
.to_string_lossy();
let xid = u32::from_str_radix(file_name, 16)?;
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
debug!("imported twophase file");
} else if file_path.starts_with("pg_wal") {
debug!("found wal file in base section. ignore it");
} else if file_path.starts_with("zenith.signal") {
// Parse zenith signal file to set correct previous LSN
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
// zenith.signal format is "PREV LSN: prev_lsn"
// TODO write serialization and deserialization in the same place.
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
@@ -570,8 +588,8 @@ fn import_file<Reader: Read>(
Ok(None)
}
fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
let mut buf: Vec<u8> = vec![];
reader.read_to_end(&mut buf)?;
reader.read_to_end(&mut buf).await?;
Ok(Bytes::copy_from_slice(&buf[..]))
}

View File

@@ -1,6 +1,7 @@
mod auth;
pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
@@ -8,15 +9,9 @@ pub(crate) mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod profiling;
pub mod repository;
pub mod storage_sync2;
pub use storage_sync2 as storage_sync;
pub mod task_mgr;
pub mod tenant;
pub mod tenant_config;
pub mod tenant_mgr;
pub mod tenant_tasks;
pub mod trace;
pub mod virtual_file;
pub mod walingest;
@@ -26,9 +21,8 @@ pub mod walredo;
use std::path::Path;
use tracing::info;
use crate::task_mgr::TaskKind;
use tracing::info;
/// Current storage format version
///
@@ -47,15 +41,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
pub enum CheckpointConfig {
// Flush all in-memory data
Flush,
// Flush all in-memory data and reconstruct all page images
Forced,
}
pub async fn shutdown_pageserver(exit_code: i32) {
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -66,7 +51,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
tenant_mgr::shutdown_all_tenants().await;
tenant::mgr::shutdown_all_tenants().await;
// Stop syncing with remote storage.
//
@@ -99,7 +84,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
}
}
fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
if n == 0 {
0.0
} else {

View File

@@ -84,13 +84,20 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
// Metrics for determining timeline's physical size.
// A layered timeline's physical is defined as the total size of
// (delta/image) layer files on disk.
static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_current_physical_size",
"Current physical size grouped by timeline",
"pageserver_resident_physical_size",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
// Corollary: If any files are missing from the index part, they won't be included here.
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
@@ -136,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1.0, // 1 sec
];
const STORAGE_IO_TIME_OPERATIONS: &[&str] =
&["open", "close", "read", "write", "seek", "fsync", "gc"];
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
];
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
@@ -201,23 +209,42 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
// remote storage metrics
pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_remote_upload_queue_unfinished_tasks",
"Number of tasks in the upload queue that are not finished yet.",
"pageserver_remote_timeline_client_calls_unfinished",
"Number of ongoing calls to remote timeline client. \
Used to populate pageserver_remote_timeline_client_calls_started. \
This metric is not useful for sampling from Prometheus, but useful in tests.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy)]
static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_timeline_client_calls_started",
"When calling a remote timeline client method, we record the current value \
of the calls_unfinished gauge in this histogram. Plot the histogram \
over time in a heatmap to visualize how many operations were ongoing \
at a given instant. It gives you a better idea of the queue depth \
than plotting the gauge directly, since operations may complete faster \
than the sampling interval.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
)
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
Download,
Delete,
}
impl RemoteOpKind {
pub fn as_str(&self) -> &str {
pub fn as_str(&self) -> &'static str {
match self {
Self::Upload => "upload",
Self::Download => "download",
@@ -226,13 +253,13 @@ impl RemoteOpKind {
}
}
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RemoteOpFileKind {
Layer,
Index,
}
impl RemoteOpFileKind {
pub fn as_str(&self) -> &str {
pub fn as_str(&self) -> &'static str {
match self {
Self::Layer => "layer",
Self::Index => "index",
@@ -240,15 +267,12 @@ impl RemoteOpFileKind {
}
}
pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"];
pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"];
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_operation_seconds",
"Time spent on remote storage operations. \
Grouped by tenant, timeline, operation_kind and status",
Grouped by tenant, timeline, operation_kind and status. \
Does not account for time spent waiting in remote timeline client's queues.",
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
)
.expect("failed to define a metric")
@@ -365,7 +389,7 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: Histogram,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub current_physical_size_gauge: UIntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -406,7 +430,7 @@ impl TimelineMetrics {
let wait_lsn_time_histo = WAIT_LSN_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
@@ -432,7 +456,7 @@ impl TimelineMetrics {
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,
current_physical_size_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
num_persistent_files_created,
persistent_bytes_written,
@@ -448,7 +472,7 @@ impl Drop for TimelineMetrics {
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -467,21 +491,6 @@ impl Drop for TimelineMetrics {
for op in SMGR_QUERY_TIME_OPERATIONS {
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]);
for file_kind in REMOTE_OPERATION_FILE_KINDS {
for op in REMOTE_OPERATION_KINDS {
for status in REMOTE_OPERATION_STATUSES {
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
tenant_id,
timeline_id,
file_kind,
op,
status,
]);
}
}
}
}
}
@@ -491,10 +500,198 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
use futures::Future;
use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::Instant;
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
}
impl RemoteTimelineClientMetrics {
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
RemoteTimelineClientMetrics {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
remote_operation_time: Mutex::new(HashMap::default()),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
calls_started_hist: Mutex::new(HashMap::default()),
remote_physical_size_gauge: Mutex::new(None),
}
}
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
guard
.get_or_insert_with(|| {
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
])
.unwrap()
})
.clone()
}
pub fn remote_operation_time(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
status: &'static str,
) -> Histogram {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.remote_operation_time.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str(), status);
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
key.2,
])
.unwrap()
});
metric.clone()
}
fn calls_unfinished_gauge(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> IntGauge {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
.unwrap()
});
metric.clone()
}
fn calls_started_hist(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> Histogram {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.calls_started_hist.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
.unwrap()
});
metric.clone()
}
}
/// See [`RemoteTimelineClientMetrics::call_begin`].
#[must_use]
pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
impl RemoteTimelineClientCallMetricGuard {
/// Consume this guard object without decrementing the metric.
/// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
pub fn will_decrement_manually(mut self) {
self.0 = None; // prevent drop() from decrementing
}
}
impl Drop for RemoteTimelineClientCallMetricGuard {
fn drop(&mut self) {
if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
guard.dec();
}
}
}
impl RemoteTimelineClientMetrics {
/// Increment the metrics that track ongoing calls to the remote timeline client instance.
///
/// Drop the returned guard object once the operation is finished to decrement the values.
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
/// is more suitable.
/// Never do both.
pub(crate) fn call_begin(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> RemoteTimelineClientCallMetricGuard {
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
self.calls_started_hist(file_kind, op_kind)
.observe(unfinished_metric.get() as f64);
unfinished_metric.inc();
RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
}
/// Manually decrement the metric instead of using the guard object.
/// Using the guard object is generally preferable.
/// See [`call_begin`] for more context.
pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
debug_assert!(
unfinished_metric.get() > 0,
"begin and end should cancel out"
);
unfinished_metric.dec();
}
}
impl Drop for RemoteTimelineClientMetrics {
fn drop(&mut self) {
let RemoteTimelineClientMetrics {
tenant_id,
timeline_id,
remote_physical_size_gauge,
remote_operation_time,
calls_unfinished_gauge,
calls_started_hist,
} = self;
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
}
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
timeline_id,
a,
b,
]);
}
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
tenant_id,
timeline_id,
a,
b,
]);
}
{
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
}
}
}
/// Wrapper future that measures the time spent by a remote storage operation,
/// and records the time and success/failure as a prometheus metric.
pub trait MeasureRemoteOp: Sized {
@@ -504,6 +701,7 @@ pub trait MeasureRemoteOp: Sized {
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
metrics: Arc<RemoteTimelineClientMetrics>,
) -> MeasuredRemoteOp<Self> {
let start = Instant::now();
MeasuredRemoteOp {
@@ -513,6 +711,7 @@ pub trait MeasureRemoteOp: Sized {
file_kind,
op,
start,
metrics,
}
}
}
@@ -529,6 +728,7 @@ pin_project! {
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
start: Instant,
metrics: Arc<RemoteTimelineClientMetrics>,
}
}
@@ -541,15 +741,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
if let Poll::Ready(ref res) = poll_result {
let duration = this.start.elapsed();
let status = if res.is_ok() { &"success" } else { &"failure" };
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[
&this.tenant_id.to_string(),
&this.timeline_id.to_string(),
this.file_kind.as_str(),
this.op.as_str(),
status,
])
.unwrap()
this.metrics
.remote_operation_time(this.file_kind, this.op, status)
.observe(duration.as_secs_f64());
}
poll_result

View File

@@ -9,7 +9,7 @@
// custom protocol.
//
use anyhow::{bail, ensure, Context, Result};
use anyhow::Context;
use bytes::Buf;
use bytes::Bytes;
use futures::{Stream, StreamExt};
@@ -19,6 +19,8 @@ use pageserver_api::models::{
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use pq_proto::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::io;
use std::net::TcpListener;
@@ -26,11 +28,9 @@ use std::str;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use tokio::pin;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tracing::*;
use utils::id::ConnectionId;
use utils::postgres_backend_async::QueryError;
use utils::{
auth::{Claims, JwtAuth, Scope},
id::{TenantId, TimelineId},
@@ -42,16 +42,14 @@ use utils::{
use crate::auth::check_permission;
use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::config::PageServerConf;
use crate::import_datadir::import_wal_from_tar;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::profiling::profpoint_start;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::mgr;
use crate::tenant::{Tenant, Timeline};
use crate::tenant_mgr;
use crate::trace::Tracer;
use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -65,8 +63,8 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
_ = task_mgr::shutdown_watcher() => {
// We were requested to shut down.
let msg = format!("pageserver is shutting down");
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg));
Err(anyhow::anyhow!(msg))
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
Err(QueryError::Other(anyhow::anyhow!(msg)))
}
msg = pgb.read_message() => { msg }
@@ -79,14 +77,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = format!("client terminated connection with Terminate message during COPY");
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
let msg = "client terminated connection with Terminate message during COPY";
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {:?}", m);
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
let msg = format!("unexpected message {m:?}");
pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
break;
}
@@ -96,12 +95,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
}
Ok(None) => {
let msg = "client closed connection during COPY";
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
pgb.flush().await?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(e) => {
Err(io::Error::new(io::ErrorKind::Other, e))?;
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
Err(io_error)?;
}
Err(other) => {
Err(io::Error::new(io::ErrorKind::Other, other))?;
}
};
}
@@ -199,23 +202,19 @@ async fn page_service_conn_main(
// we've been requested to shut down
Ok(())
}
Err(err) => {
let root_cause_io_err_kind = err
.root_cause()
.downcast_ref::<io::Error>()
.map(|e| e.kind());
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
// `ConnectionReset` error happens when the Postgres client closes the connection.
// As this disconnection happens quite often and is expected,
// we decided to downgrade the logging level to `INFO`.
// See: https://github.com/neondatabase/neon/issues/1683.
if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
if io_error.kind() == io::ErrorKind::ConnectionReset {
info!("Postgres client disconnected");
Ok(())
} else {
Err(err)
Err(io_error).context("Postgres connection error")
}
}
other => other.context("Postgres query error"),
}
}
@@ -254,7 +253,7 @@ impl PageRequestMetrics {
#[derive(Debug)]
struct PageServerHandler {
conf: &'static PageServerConf,
_conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
claims: Option<Claims>,
}
@@ -262,7 +261,7 @@ struct PageServerHandler {
impl PageServerHandler {
pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
PageServerHandler {
conf,
_conf: conf,
auth,
claims: None,
}
@@ -317,7 +316,7 @@ impl PageServerHandler {
Some(FeMessage::CopyData(bytes)) => bytes,
Some(FeMessage::Terminate) => break,
Some(m) => {
bail!("unexpected message: {m:?} during COPY");
anyhow::bail!("unexpected message: {m:?} during COPY");
}
None => break, // client disconnected
};
@@ -374,7 +373,7 @@ impl PageServerHandler {
base_lsn: Lsn,
_end_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline
info!("creating new timeline");
@@ -396,9 +395,7 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let copyin_stream = copyin_stream(pgb);
pin!(copyin_stream);
let mut copyin_stream = Box::pin(copyin_stream(pgb));
timeline
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
.await?;
@@ -430,11 +427,16 @@ impl PageServerHandler {
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
ensure!(timeline.get_last_record_lsn() == start_lsn);
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
@@ -444,10 +446,8 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| {
import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)
})?;
let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?;
info!("wal import complete");
// Drain the rest of the Copy data
@@ -460,13 +460,17 @@ impl PageServerHandler {
}
// TODO Does it make sense to overshoot?
ensure!(timeline.get_last_record_lsn() >= end_lsn);
if timeline.get_last_record_lsn() < end_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush).await?;
timeline.freeze_and_flush().await?;
info!("done");
Ok(())
@@ -489,7 +493,7 @@ impl PageServerHandler {
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<Lsn> {
) -> anyhow::Result<Lsn> {
if latest {
// Latest page version was requested. If LSN is given, it is a hint
// to the page server that there have been no modifications to the
@@ -520,11 +524,11 @@ impl PageServerHandler {
}
} else {
if lsn == Lsn(0) {
bail!("invalid LSN(0) in request");
anyhow::bail!("invalid LSN(0) in request");
}
timeline.wait_lsn(lsn).await?;
}
ensure!(
anyhow::ensure!(
lsn >= **latest_gc_cutoff_lsn,
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
@@ -537,12 +541,15 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?;
let exists = crate::tenant::with_ondemand_download(|| {
timeline.get_rel_exists(req.rel, lsn, req.latest)
})
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
exists,
@@ -554,12 +561,15 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?;
let n_blocks = crate::tenant::with_ondemand_download(|| {
timeline.get_rel_size(req.rel, lsn, req.latest)
})
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
@@ -571,14 +581,15 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamDbSizeRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
let total_blocks =
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?;
let total_blocks = crate::tenant::with_ondemand_download(|| {
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
})
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
@@ -591,7 +602,7 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
@@ -604,11 +615,10 @@ impl PageServerHandler {
}
*/
// FIXME: this profiling now happens at different place than it used to. The
// current profiling is based on a thread-local variable, so it doesn't work
// across awaits
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;
let page = crate::tenant::with_ondemand_download(|| {
timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
})
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
@@ -642,16 +652,12 @@ impl PageServerHandler {
pgb.flush().await?;
/* Send a tarball of the latest layer on the timeline */
let mut writer = CopyDataSink {
pgb,
rt: tokio::runtime::Handle::current(),
};
tokio::task::block_in_place(|| {
let basebackup =
basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str());
basebackup.send_tarball()
})?;
{
let mut writer = pgb.copyout_writer();
basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup)
.await?;
}
pgb.write_message(&BeMessage::CopyDone)?;
pgb.flush().await?;
info!("basebackup complete");
@@ -661,7 +667,7 @@ impl PageServerHandler {
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok
return Ok(());
@@ -683,20 +689,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
&mut self,
_pgb: &mut PostgresBackend,
jwt_response: &[u8],
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
// which requires auth to be present
let data = self
.auth
.as_ref()
.unwrap()
.decode(str::from_utf8(jwt_response)?)?;
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
if matches!(data.claims.scope, Scope::Tenant) {
ensure!(
data.claims.tenant_id.is_some(),
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing"
)
)));
}
info!(
@@ -708,22 +713,33 @@ impl postgres_backend_async::Handler for PageServerHandler {
Ok(())
}
fn startup(
&mut self,
_pgb: &mut PostgresBackend,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
Ok(())
}
async fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> anyhow::Result<()> {
debug!("process query {:?}", query_string);
) -> Result<(), QueryError> {
debug!("process query {query_string:?}");
if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
ensure!(
params.len() == 2,
"invalid param number for pagestream command"
);
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
@@ -733,18 +749,24 @@ impl postgres_backend_async::Handler for PageServerHandler {
let (_, params_raw) = query_string.split_at("basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() >= 2,
"invalid param number for basebackup command"
);
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for basebackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
let lsn = if params.len() == 3 {
Some(Lsn::from_str(params[2])?)
Some(
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
@@ -759,13 +781,16 @@ impl postgres_backend_async::Handler for PageServerHandler {
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() == 2,
"invalid param number for get_last_record_rlsn command"
);
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for get_last_record_rlsn command"
)));
}
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
@@ -787,22 +812,31 @@ impl postgres_backend_async::Handler for PageServerHandler {
let (_, params_raw) = query_string.split_at("fullbackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() >= 2,
"invalid param number for fullbackup command"
);
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for fullbackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if params.len() > 2 {
Some(Lsn::from_str(params[2])?)
Some(
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let prev_lsn = if params.len() > 3 {
Some(Lsn::from_str(params[3])?)
Some(
Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
)
} else {
None
};
@@ -827,12 +861,21 @@ impl postgres_backend_async::Handler for PageServerHandler {
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let (_, params_raw) = query_string.split_at("import basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(params.len() == 5);
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let base_lsn = Lsn::from_str(params[2])?;
let end_lsn = Lsn::from_str(params[3])?;
let pg_version = u32::from_str(params[4])?;
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let base_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
let pg_version = u32::from_str(params[4])
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
self.check_permission(Some(tenant_id))?;
@@ -850,7 +893,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
pgb.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.starts_with("import wal ") {
@@ -860,11 +906,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
// caller should poll the http api to check when that is done.
let (_, params_raw) = query_string.split_at("import wal ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(params.len() == 4);
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let start_lsn = Lsn::from_str(params[2])?;
let end_lsn = Lsn::from_str(params[3])?;
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let start_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
self.check_permission(Some(tenant_id))?;
@@ -875,7 +929,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
pgb.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.to_ascii_lowercase().starts_with("set ") {
@@ -886,8 +943,13 @@ impl postgres_backend_async::Handler for PageServerHandler {
// show <tenant_id>
let (_, params_raw) = query_string.split_at("show ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
ensure!(params.len() == 1, "invalid param number for config command");
let tenant_id = TenantId::from_str(params[0])?;
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
self.check_permission(Some(tenant_id))?;
@@ -928,7 +990,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
bail!("unknown command");
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"
)));
}
Ok(())
@@ -940,8 +1004,8 @@ impl postgres_backend_async::Handler for PageServerHandler {
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
let tenant = mgr::get_tenant(tenant_id, false).await?;
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
Ok(wait_result) => wait_result
// no .context(), the error message is good enough and some tests depend on it
@@ -954,37 +1018,8 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenan
async fn get_active_timeline_with_timeout(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>> {
) -> anyhow::Result<Arc<Timeline>> {
get_active_tenant_with_timeout(tenant_id)
.await
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
}
///
/// A std::io::Write implementation that wraps all data written to it in CopyData
/// messages.
///
struct CopyDataSink<'a> {
pgb: &'a mut PostgresBackend,
rt: tokio::runtime::Handle,
}
impl<'a> io::Write for CopyDataSink<'a> {
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
// CopyData
// FIXME: if the input is large, we should split it into multiple messages.
// Not sure what the threshold should be, but the ultimate hard limit is that
// the length cannot exceed u32.
// FIXME: flush isn't really required, but makes it easier
// to view in wireshark
self.pgb.write_message(&BeMessage::CopyData(data))?;
self.rt.block_on(self.pgb.flush())?;
trace!("CopyData sent for {} bytes!", data.len());
Ok(data.len())
}
fn flush(&mut self) -> io::Result<()> {
// no-op
Ok(())
}
}

View File

@@ -6,11 +6,12 @@
//! walingest.rs handles a few things like implicit relation creation and extension.
//! Clarify that)
//!
use super::tenant::PageReconstructResult;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::tenant::Timeline;
use crate::tenant::{with_ondemand_download, Timeline};
use crate::walrecord::NeonWalRecord;
use anyhow::{bail, ensure, Result};
use crate::{repository::*, try_no_ondemand_download};
use anyhow::Context;
use bytes::{Buf, Bytes};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -19,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -33,6 +35,14 @@ pub enum LsnForTimestamp {
NoData(Lsn),
}
#[derive(Debug, thiserror::Error)]
pub enum CalculateLogicalSizeError {
#[error("cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
///
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
/// and other special kinds of files, in a versioned key-value store. The
@@ -88,16 +98,18 @@ impl Timeline {
blknum: BlockNumber,
lsn: Lsn,
latest: bool,
) -> Result<Bytes> {
ensure!(tag.relnode != 0, "invalid relnode");
) -> PageReconstructResult<Bytes> {
if tag.relnode == 0 {
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
}
let nblocks = self.get_rel_size(tag, lsn, latest)?;
let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag, blknum, lsn, nblocks
);
return Ok(ZERO_PAGE.clone());
return PageReconstructResult::Success(ZERO_PAGE.clone());
}
let key = rel_block_to_key(tag, blknum);
@@ -105,38 +117,51 @@ impl Timeline {
}
// Get size of a database in blocks
pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result<usize> {
pub fn get_db_size(
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
latest: bool,
) -> PageReconstructResult<usize> {
let mut total_blocks = 0;
let rels = self.list_rels(spcnode, dbnode, lsn)?;
let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));
for rel in rels {
let n_blocks = self.get_rel_size(rel, lsn, latest)?;
let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
PageReconstructResult::Success(total_blocks)
}
/// Get size of a relation file
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result<BlockNumber> {
ensure!(tag.relnode != 0, "invalid relnode");
pub fn get_rel_size(
&self,
tag: RelTag,
lsn: Lsn,
latest: bool,
) -> PageReconstructResult<BlockNumber> {
if tag.relnode == 0 {
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(nblocks);
return PageReconstructResult::Success(nblocks);
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, lsn, latest)?
&& !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
// without extending it. Tolerate that by claiming that
// any non-existent FSM fork has size 0.
return Ok(0);
return PageReconstructResult::Success(0);
}
let key = rel_size_to_key(tag);
let mut buf = self.get(key, lsn)?;
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
let nblocks = buf.get_u32_le();
if latest {
@@ -149,43 +174,62 @@ impl Timeline {
// associated with most recent value of LSN.
self.update_cached_rel_size(tag, lsn, nblocks);
}
Ok(nblocks)
PageReconstructResult::Success(nblocks)
}
/// Does relation exist?
pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result<bool> {
ensure!(tag.relnode != 0, "invalid relnode");
pub fn get_rel_exists(
&self,
tag: RelTag,
lsn: Lsn,
_latest: bool,
) -> PageReconstructResult<bool> {
if tag.relnode == 0 {
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
}
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(true);
return PageReconstructResult::Success(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = self.get(key, lsn)?;
let dir = RelDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(key, lsn));
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
Ok(exists)
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
PageReconstructResult::Success(exists)
}
Err(e) => PageReconstructResult::from(e),
}
}
/// Get a list of all existing relations in given tablespace and database.
pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
pub fn list_rels(
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
) -> PageReconstructResult<HashSet<RelTag>> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = self.get(key, lsn)?;
let dir = RelDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(key, lsn));
let rels: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
relnode: *relnode,
forknum: *forknum,
}));
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let rels: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
relnode: *relnode,
forknum: *forknum,
}));
Ok(rels)
PageReconstructResult::Success(rels)
}
Err(e) => PageReconstructResult::from(e),
}
}
/// Look up given SLRU page version.
@@ -195,7 +239,7 @@ impl Timeline {
segno: u32,
blknum: BlockNumber,
lsn: Lsn,
) -> Result<Bytes> {
) -> PageReconstructResult<Bytes> {
let key = slru_block_to_key(kind, segno, blknum);
self.get(key, lsn)
}
@@ -206,21 +250,30 @@ impl Timeline {
kind: SlruKind,
segno: u32,
lsn: Lsn,
) -> Result<BlockNumber> {
) -> PageReconstructResult<BlockNumber> {
let key = slru_segment_size_to_key(kind, segno);
let mut buf = self.get(key, lsn)?;
Ok(buf.get_u32_le())
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
PageReconstructResult::Success(buf.get_u32_le())
}
/// Get size of an SLRU segment
pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
pub fn get_slru_segment_exists(
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
) -> PageReconstructResult<bool> {
// fetch directory listing
let key = slru_dir_to_key(kind);
let buf = self.get(key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(key, lsn));
let exists = dir.segments.get(&segno).is_some();
Ok(exists)
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.segments.get(&segno).is_some();
PageReconstructResult::Success(exists)
}
Err(e) => PageReconstructResult::from(e),
}
}
/// Locate LSN, such that all transactions that committed before
@@ -230,7 +283,10 @@ impl Timeline {
/// so it's not well defined which LSN you get if there were multiple commits
/// "in flight" at that point in time.
///
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
pub fn find_lsn_for_timestamp(
&self,
search_timestamp: TimestampTz,
) -> PageReconstructResult<LsnForTimestamp> {
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let min_lsn = *gc_cutoff_lsn_guard;
let max_lsn = self.get_last_record_lsn();
@@ -246,12 +302,12 @@ impl Timeline {
// cannot overflow, high and low are both smaller than u64::MAX / 2
let mid = (high + low) / 2;
let cmp = self.is_latest_commit_timestamp_ge_than(
let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
search_timestamp,
Lsn(mid * 8),
&mut found_smaller,
&mut found_larger,
)?;
));
if cmp {
high = mid;
@@ -263,15 +319,15 @@ impl Timeline {
(false, false) => {
// This can happen if no commit records have been processed yet, e.g.
// just after importing a cluster.
Ok(LsnForTimestamp::NoData(max_lsn))
PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
}
(true, false) => {
// Didn't find any commit timestamps larger than the request
Ok(LsnForTimestamp::Future(max_lsn))
PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
}
(false, true) => {
// Didn't find any commit timestamps smaller than the request
Ok(LsnForTimestamp::Past(max_lsn))
PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
}
(true, true) => {
// low is the LSN of the first commit record *after* the search_timestamp,
@@ -281,7 +337,7 @@ impl Timeline {
// Otherwise, if you restore to the returned LSN, the database will
// include physical changes from later commits that will be marked
// as aborted, and will need to be vacuumed away.
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
}
}
}
@@ -299,12 +355,20 @@ impl Timeline {
probe_lsn: Lsn,
found_smaller: &mut bool,
found_larger: &mut bool,
) -> Result<bool> {
for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
) -> PageReconstructResult<bool> {
for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
SlruKind::Clog,
segno,
probe_lsn
));
for blknum in (0..nblocks).rev() {
let clog_page =
self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
SlruKind::Clog,
segno,
blknum,
probe_lsn
));
if clog_page.len() == BLCKSZ as usize + 8 {
let mut timestamp_bytes = [0u8; 8];
@@ -313,61 +377,75 @@ impl Timeline {
if timestamp >= search_timestamp {
*found_larger = true;
return Ok(true);
return PageReconstructResult::Success(true);
} else {
*found_smaller = true;
}
}
}
}
Ok(false)
PageReconstructResult::Success(false)
}
/// Get a list of SLRU segments
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
pub fn list_slru_segments(
&self,
kind: SlruKind,
lsn: Lsn,
) -> PageReconstructResult<HashSet<u32>> {
// fetch directory entry
let key = slru_dir_to_key(kind);
let buf = self.get(key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
Ok(dir.segments)
let buf = try_no_ondemand_download!(self.get(key, lsn));
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => PageReconstructResult::Success(dir.segments),
Err(e) => PageReconstructResult::from(e),
}
}
pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
pub fn get_relmap_file(
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
) -> PageReconstructResult<Bytes> {
let key = relmap_file_key(spcnode, dbnode);
let buf = self.get(key, lsn)?;
Ok(buf)
let buf = try_no_ondemand_download!(self.get(key, lsn));
PageReconstructResult::Success(buf)
}
pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
// fetch directory entry
let buf = self.get(DBDIR_KEY, lsn)?;
let dir = DbDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));
Ok(dir.dbdirs)
match DbDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
Err(e) => PageReconstructResult::from(e),
}
}
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
let key = twophase_file_key(xid);
let buf = self.get(key, lsn)?;
Ok(buf)
let buf = try_no_ondemand_download!(self.get(key, lsn));
PageReconstructResult::Success(buf)
}
pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
// fetch directory entry
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
let dir = TwoPhaseDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));
Ok(dir.xids)
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => PageReconstructResult::Success(dir.xids),
Err(e) => PageReconstructResult::from(e),
}
}
pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
self.get(CONTROLFILE_KEY, lsn)
}
pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
self.get(CHECKPOINT_KEY, lsn)
}
@@ -376,16 +454,26 @@ impl Timeline {
///
/// Only relation blocks are counted currently. That excludes metadata,
/// SLRUs, twophase files etc.
pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
cancel: CancellationToken,
) -> Result<u64, CalculateLogicalSizeError> {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn)?;
let dbdir = DbDirectory::des(&buf)?;
let buf = self.get_download(DBDIR_KEY, lsn).await?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
for rel in
crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
.await?
{
if cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled);
}
let relsize_key = rel_size_to_key(rel);
let mut buf = self.get(relsize_key, lsn)?;
let mut buf = self.get_download(relsize_key, lsn).await?;
let relsize = buf.get_u32_le();
total_size += relsize as u64;
@@ -398,7 +486,7 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from
/// that LSN forwards).
pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -406,8 +494,8 @@ impl Timeline {
result.add_key(DBDIR_KEY);
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn)?;
let dbdir = DbDirectory::des(&buf)?;
let buf = self.get_download(DBDIR_KEY, lsn).await?;
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
dbs.sort_unstable();
@@ -415,15 +503,15 @@ impl Timeline {
result.add_key(relmap_file_key(spcnode, dbnode));
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
.list_rels(spcnode, dbnode, lsn)?
.iter()
.cloned()
.collect();
let mut rels: Vec<RelTag> =
with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
.await?
.into_iter()
.collect();
rels.sort_unstable();
for rel in rels {
let relsize_key = rel_size_to_key(rel);
let mut buf = self.get(relsize_key, lsn)?;
let mut buf = self.get_download(relsize_key, lsn).await?;
let relsize = buf.get_u32_le();
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
@@ -439,13 +527,13 @@ impl Timeline {
] {
let slrudir_key = slru_dir_to_key(kind);
result.add_key(slrudir_key);
let buf = self.get(slrudir_key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
let buf = self.get_download(slrudir_key, lsn).await?;
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
segments.sort_unstable();
for segno in segments {
let segsize_key = slru_segment_size_to_key(kind, segno);
let mut buf = self.get(segsize_key, lsn)?;
let mut buf = self.get_download(segsize_key, lsn).await?;
let segsize = buf.get_u32_le();
result.add_range(
@@ -457,8 +545,8 @@ impl Timeline {
// Then pg_twophase
result.add_key(TWOPHASEDIR_KEY);
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
xids.sort_unstable();
for xid in xids {
@@ -537,7 +625,7 @@ impl<'a> DatadirModification<'a> {
///
/// This inserts the directory metadata entries that are assumed to
/// always exist.
pub fn init_empty(&mut self) -> Result<()> {
pub fn init_empty(&mut self) -> anyhow::Result<()> {
let buf = DbDirectory::ser(&DbDirectory {
dbdirs: HashMap::new(),
})?;
@@ -570,8 +658,8 @@ impl<'a> DatadirModification<'a> {
rel: RelTag,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -583,7 +671,7 @@ impl<'a> DatadirModification<'a> {
segno: u32,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> Result<()> {
) -> anyhow::Result<()> {
self.put(
slru_block_to_key(kind, segno, blknum),
Value::WalRecord(rec),
@@ -597,8 +685,8 @@ impl<'a> DatadirModification<'a> {
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
}
@@ -609,26 +697,26 @@ impl<'a> DatadirModification<'a> {
segno: u32,
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
) -> anyhow::Result<()> {
self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
Ok(())
}
/// Store a relmapper file (pg_filenode.map) in the repository
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> {
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY)?;
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
let mut dbdir = DbDirectory::des(&buf)?;
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
if r == None || r == Some(false) {
if r.is_none() || r == Some(false) {
// The dbdir entry didn't exist, or it contained a
// 'false'. The 'insert' call already updated it with
// 'true', now write the updated 'dbdirs' map back.
let buf = DbDirectory::ser(&dbdir)?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
}
if r == None {
if r.is_none() {
// Create RelDirectory
let buf = RelDirectory::ser(&RelDirectory {
rels: HashSet::new(),
@@ -643,12 +731,12 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> {
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
// Add it to the directory entry
let buf = self.get(TWOPHASEDIR_KEY)?;
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.insert(xid) {
bail!("twophase file for xid {} already exists", xid);
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.put(
TWOPHASEDIR_KEY,
@@ -659,23 +747,26 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn put_control_file(&mut self, img: Bytes) -> Result<()> {
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
self.put(CONTROLFILE_KEY, Value::Image(img));
Ok(())
}
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> {
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
self.put(CHECKPOINT_KEY, Value::Image(img));
Ok(())
}
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
let req_lsn = self.tline.get_last_record_lsn();
let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, req_lsn, true)
.no_ondemand_download()?;
// Remove entry from dbdir
let buf = self.get(DBDIR_KEY)?;
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
let mut dir = DbDirectory::des(&buf)?;
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
let buf = DbDirectory::ser(&dir)?;
@@ -698,11 +789,11 @@ impl<'a> DatadirModification<'a> {
/// Create a relation fork.
///
/// 'nblocks' is the initial size.
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?;
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
// Didn't exist. Update dbdir
@@ -714,12 +805,12 @@ impl<'a> DatadirModification<'a> {
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key)?)?
RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
bail!("rel {} already exists", rel);
anyhow::bail!("rel {rel} already exists");
}
self.put(
rel_dir_key,
@@ -742,13 +833,17 @@ impl<'a> DatadirModification<'a> {
}
/// Truncate relation
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn, true)? {
if self
.tline
.get_rel_exists(rel, last_lsn, true)
.no_ondemand_download()?
{
let size_key = rel_size_to_key(rel);
// Fetch the old size first
let old_size = self.get(size_key)?.get_u32_le();
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
// Update the entry with the new size.
let buf = nblocks.to_le_bytes();
@@ -768,12 +863,12 @@ impl<'a> DatadirModification<'a> {
/// Extend relation
/// If new size is smaller, do nothing.
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Put size
let size_key = rel_size_to_key(rel);
let old_size = self.get(size_key)?.get_u32_le();
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
// only extend relation here. never decrease the size
if nblocks > old_size {
@@ -789,12 +884,12 @@ impl<'a> DatadirModification<'a> {
}
/// Drop a relation.
pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Remove it from the directory entry
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let buf = self.get(dir_key)?;
let buf = self.get(dir_key).no_ondemand_download()?;
let mut dir = RelDirectory::des(&buf)?;
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
@@ -805,7 +900,7 @@ impl<'a> DatadirModification<'a> {
// update logical size
let size_key = rel_size_to_key(rel);
let old_size = self.get(size_key)?.get_u32_le();
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
self.pending_nblocks -= old_size as i64;
// Remove enty from relation size cache
@@ -822,14 +917,14 @@ impl<'a> DatadirModification<'a> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
) -> Result<()> {
) -> anyhow::Result<()> {
// Add it to the directory entry
let dir_key = slru_dir_to_key(kind);
let buf = self.get(dir_key)?;
let buf = self.get(dir_key).no_ondemand_download()?;
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.insert(segno) {
bail!("slru segment {:?}/{} already exists", kind, segno);
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
}
self.put(
dir_key,
@@ -852,7 +947,7 @@ impl<'a> DatadirModification<'a> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
) -> Result<()> {
) -> anyhow::Result<()> {
// Put size
let size_key = slru_segment_size_to_key(kind, segno);
let buf = nblocks.to_le_bytes();
@@ -861,10 +956,10 @@ impl<'a> DatadirModification<'a> {
}
/// This method is used for marking truncated SLRU files
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> {
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
// Remove it from the directory entry
let dir_key = slru_dir_to_key(kind);
let buf = self.get(dir_key)?;
let buf = self.get(dir_key).no_ondemand_download()?;
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.remove(&segno) {
@@ -882,15 +977,15 @@ impl<'a> DatadirModification<'a> {
}
/// Drop a relmapper file (pg_filenode.map)
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
// TODO
Ok(())
}
/// This method is used for marking truncated SLRU files
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> {
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
// Remove it from the directory entry
let buf = self.get(TWOPHASEDIR_KEY)?;
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.remove(&xid) {
@@ -925,7 +1020,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub fn flush(&mut self) -> Result<()> {
pub fn flush(&mut self) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -936,7 +1031,7 @@ impl<'a> DatadirModification<'a> {
let writer = self.tline.writer();
// Flush relation and SLRU data blocks, keep metadata.
let mut result: Result<()> = Ok(());
let mut result: anyhow::Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
@@ -984,7 +1079,7 @@ impl<'a> DatadirModification<'a> {
// Internal helper functions to batch the modifications
fn get(&self, key: Key) -> Result<Bytes> {
fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
// Have we already updated the same key? Read the pending updated
// version in that case.
//
@@ -992,14 +1087,14 @@ impl<'a> DatadirModification<'a> {
// value that has been removed, deletion only avoids leaking storage.
if let Some(value) = self.pending_updates.get(&key) {
if let Value::Image(img) = value {
Ok(img.clone())
PageReconstructResult::Success(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
bail!("unexpected pending WAL record");
PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
}
} else {
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
@@ -1327,7 +1422,7 @@ fn twophase_key_range(xid: TransactionId) -> Range<Key> {
field2: 0,
field3: 0,
field4: 0,
field5: if overflowed { 1 } else { 0 },
field5: u8::from(overflowed),
field6: next_xid,
}
}
@@ -1354,7 +1449,7 @@ const CHECKPOINT_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
RelTag {
@@ -1365,7 +1460,7 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
},
key.field6,
),
_ => bail!("unexpected value kind 0x{:02x}", key.field1),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
@@ -1384,21 +1479,21 @@ pub fn is_rel_vm_block_key(key: Key) -> bool {
&& key.field6 != 0xffffffff
}
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => bail!("unrecognized slru kind 0x{:02x}", key.field2),
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => bail!("unexpected value kind 0x{:02x}", key.field1),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
@@ -1413,7 +1508,7 @@ pub fn create_test_timeline(
tenant: &crate::tenant::Tenant,
timeline_id: utils::id::TimelineId,
pg_version: u32,
) -> Result<std::sync::Arc<Timeline>> {
) -> anyhow::Result<std::sync::Arc<Timeline>> {
let tline = tenant
.create_empty_timeline(timeline_id, Lsn(8), pg_version)?
.initialize()?;

View File

@@ -1,107 +0,0 @@
//!
//! Support for profiling
//!
//! This relies on a modified version of the 'pprof-rs' crate. That's not very
//! nice, so to avoid a hard dependency on that, this is an optional feature.
//!
use crate::config::{PageServerConf, ProfilingConfig};
/// The actual implementation is in the `profiling_impl` submodule. If the profiling
/// feature is not enabled, it's just a dummy implementation that panics if you
/// try to enabled profiling in the configuration.
pub use profiling_impl::*;
#[cfg(feature = "profiling")]
mod profiling_impl {
use super::*;
use pprof;
use std::marker::PhantomData;
/// Start profiling the current thread. Returns a guard object;
/// the profiling continues until the guard is dropped.
///
/// Note: profiling is not re-entrant. If you call 'profpoint_start' while
/// profiling is already started, nothing happens, and the profiling will be
/// stopped when either guard object is dropped.
#[inline]
pub fn profpoint_start(
conf: &crate::config::PageServerConf,
point: ProfilingConfig,
) -> Option<ProfilingGuard> {
if conf.profiling == point {
pprof::start_profiling();
Some(ProfilingGuard(PhantomData))
} else {
None
}
}
/// A hack to remove Send and Sync from the ProfilingGuard. Because the
/// profiling is attached to current thread.
////
/// See comments in https://github.com/rust-lang/rust/issues/68318
type PhantomUnsend = std::marker::PhantomData<*mut u8>;
pub struct ProfilingGuard(PhantomUnsend);
impl Drop for ProfilingGuard {
fn drop(&mut self) {
pprof::stop_profiling();
}
}
/// Initialize the profiler. This must be called before any 'profpoint_start' calls.
pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
if conf.profiling != ProfilingConfig::Disabled {
Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
} else {
None
}
}
/// Exit the profiler. Writes the flamegraph to current workdir.
pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
// Write out the flamegraph
if let Some(profiler_guard) = profiler_guard {
if let Ok(report) = profiler_guard.report().build() {
// this gets written under the workdir
let file = std::fs::File::create("flamegraph.svg").unwrap();
let mut options = pprof::flamegraph::Options::default();
options.image_width = Some(2500);
report.flamegraph_with_options(file, &mut options).unwrap();
}
}
}
}
/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
#[cfg(not(feature = "profiling"))]
mod profiling_impl {
use super::*;
pub struct DummyProfilerGuard;
impl Drop for DummyProfilerGuard {
fn drop(&mut self) {
// do nothing, this exists to calm Clippy down
}
}
pub fn profpoint_start(
_conf: &PageServerConf,
_point: ProfilingConfig,
) -> Option<DummyProfilerGuard> {
None
}
pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
if conf.profiling != ProfilingConfig::Disabled {
// shouldn't happen, we don't allow profiling in the config if the support
// for it is disabled.
panic!("profiling enabled but the binary was compiled without profiling support");
}
None
}
pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
}

View File

@@ -1,232 +0,0 @@
//! Helper functions to download files from remote storage with a RemoteStorage
use std::collections::HashSet;
use std::path::Path;
use anyhow::{bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tracing::{debug, info_span, Instrument};
use crate::config::PageServerConf;
use crate::storage_sync::index::LayerFileMetadata;
use crate::tenant::filename::LayerFileName;
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, IndexPartUnclean};
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await
}
///
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
///
/// Returns the size of the downloaded file.
pub async fn download_layer_file<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
) -> anyhow::Result<u64> {
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
let local_path = timeline_path.join(layer_file_name.file_name());
let remote_path = conf.remote_path(&local_path)?;
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})?;
let mut download = storage.download(&remote_path).await.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
)
})?;
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
// But for additional safety lets check/wait for any pending operations.
destination_file.flush().await.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})?;
match layer_metadata.file_size() {
Some(expected) if expected != bytes_amount => {
anyhow::bail!(
"According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
temp_file_path.display()
);
}
Some(_) | None => {
// matches, or upgrading from an earlier IndexPart version
}
}
// not using sync_data because it can lose file size update
destination_file.sync_all().await.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
bail!("remote-storage-download-pre-rename failpoint triggered")
});
fs::rename(&temp_file_path, &local_path).await?;
fsync_path(&local_path)
.await
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?;
tracing::info!("download complete: {}", local_path.display());
Ok(bytes_amount)
}
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub fn is_temp_download_file(path: &Path) -> bool {
let extension = path.extension().map(|pname| {
pname
.to_str()
.expect("paths passed to this function must be valid Rust strings")
});
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,
Some(_) => false,
None => false,
}
}
/// List timelines of given tenant in remote storage
pub async fn list_remote_timelines<'a>(
storage: &'a GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = conf.remote_path(&tenant_path)?;
let timelines = storage
.list_prefixes(Some(&tenant_storage_path))
.await
.with_context(|| {
format!(
"Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
)
})?;
if timelines.is_empty() {
anyhow::bail!("no timelines found on the remote storage")
}
let mut timeline_ids = HashSet::new();
let mut part_downloads = FuturesUnordered::new();
for timeline_remote_storage_key in timelines {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: TimelineId = object_name.parse().with_context(|| {
format!("failed to parse object name into timeline id '{object_name}'")
})?;
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
// yet, launch a download task for it.
if !timeline_ids.contains(&timeline_id) {
timeline_ids.insert(timeline_id);
let storage_clone = storage.clone();
part_downloads.push(async move {
(
timeline_id,
download_index_part(conf, &storage_clone, tenant_id, timeline_id)
.instrument(info_span!("download_index_part", timeline=%timeline_id))
.await,
)
});
}
}
// Wait for all the download tasks to complete.
let mut timeline_parts = Vec::new();
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
let index_part = part_upload_result
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
debug!("Successfully fetched index part for timeline {timeline_id}");
timeline_parts.push((timeline_id, index_part));
}
Ok(timeline_parts)
}
pub async fn download_index_part(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<IndexPart, DownloadError> {
let index_part_path = conf
.metadata_path(timeline_id, tenant_id)
.with_file_name(IndexPart::FILE_NAME);
let part_storage_path = conf
.remote_path(&index_part_path)
.map_err(DownloadError::BadInput)?;
let mut index_part_download = storage.download(&part_storage_path).await?;
let mut index_part_bytes = Vec::new();
tokio::io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
.map_err(DownloadError::Other)?;
let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
.with_context(|| {
format!("Failed to deserialize index part file into file {index_part_path:?}")
})
.map_err(DownloadError::Other)?;
let index_part = index_part.remove_unclean_layer_file_names();
Ok(index_part)
}

View File

@@ -25,7 +25,6 @@
//! the current task has been requested to shut down. You can use that with
//! Tokio select!().
//!
//!
//! TODO: This would be a good place to also handle panics in a somewhat sane way.
//! Depending on what task panics, we might want to kill the whole server, or
//! only a single tenant or timeline.
@@ -36,6 +35,7 @@
#![allow(clippy::declare_interior_mutable_const)]
use std::collections::HashMap;
use std::fmt;
use std::future::Future;
use std::panic::AssertUnwindSafe;
use std::sync::atomic::{AtomicU64, Ordering};
@@ -43,9 +43,9 @@ use std::sync::{Arc, Mutex};
use futures::FutureExt;
use tokio::runtime::Runtime;
use tokio::sync::watch;
use tokio::task::JoinHandle;
use tokio::task_local;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
@@ -135,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.expect("Failed to create background op runtime")
});
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);
impl fmt::Display for PageserverTaskId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
/// Each task that we track is associated with a "task ID". It's just an
/// increasing number that we assign. Note that it is different from tokio::task::Id.
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
@@ -146,11 +153,10 @@ static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
task_local! {
// There is a Tokio watch channel for each task, which can be used to signal the
// task that it needs to shut down. This task local variable holds the receiving
// end of the channel. The sender is kept in the global registry, so that anyone
// can send the signal to request task shutdown.
static SHUTDOWN_RX: watch::Receiver<bool>;
// This is a cancellation token which will be cancelled when a task needs to shut down. The
// root token is kept in the global registry, so that anyone can send the signal to request
// task shutdown.
static SHUTDOWN_TOKEN: CancellationToken;
// Each task holds reference to its own PageServerTask here.
static CURRENT_TASK: Arc<PageServerTask>;
@@ -200,11 +206,20 @@ pub enum TaskKind {
// Task that uploads a file to remote storage
RemoteUploadTask,
// Task that downloads a file from remote storage
RemoteDownloadTask,
// task that handles the initial downloading of all tenants
InitialLoad,
// task that handles attaching a tenant
Attach,
// task that handhes metrics collection
MetricsCollection,
// task that drives downloading layers
DownloadAllRemoteLayers,
}
#[derive(Default)]
@@ -226,8 +241,8 @@ struct PageServerTask {
name: String,
// To request task shutdown, send 'true' to the channel to notify the task.
shutdown_tx: watch::Sender<bool>,
// To request task shutdown, just cancel this token.
cancel: CancellationToken,
mutable: Mutex<MutableTaskState>,
}
@@ -247,13 +262,13 @@ pub fn spawn<F>(
where
F: Future<Output = anyhow::Result<()>> + Send + 'static,
{
let (shutdown_tx, shutdown_rx) = watch::channel(false);
let cancel = CancellationToken::new();
let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
let task = Arc::new(PageServerTask {
task_id: PageserverTaskId(task_id),
kind,
name: name.to_string(),
shutdown_tx,
cancel: cancel.clone(),
mutable: Mutex::new(MutableTaskState {
tenant_id,
timeline_id,
@@ -271,7 +286,7 @@ where
task_name,
task_id,
task_cloned,
shutdown_rx,
cancel,
shutdown_process_on_error,
future,
));
@@ -288,7 +303,7 @@ async fn task_wrapper<F>(
task_name: String,
task_id: u64,
task: Arc<PageServerTask>,
shutdown_rx: watch::Receiver<bool>,
shutdown_token: CancellationToken,
shutdown_process_on_error: bool,
future: F,
) where
@@ -296,9 +311,9 @@ async fn task_wrapper<F>(
{
debug!("Starting task '{}'", task_name);
let result = SHUTDOWN_RX
let result = SHUTDOWN_TOKEN
.scope(
shutdown_rx,
shutdown_token,
CURRENT_TASK.scope(task, {
// We use AssertUnwindSafe here so that the payload function
// doesn't need to be UnwindSafe. We don't do anything after the
@@ -408,7 +423,7 @@ pub async fn shutdown_tasks(
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
{
let _ = task.shutdown_tx.send_replace(true);
task.cancel.cancel();
victim_tasks.push(Arc::clone(task));
}
}
@@ -436,24 +451,35 @@ pub fn current_task_kind() -> Option<TaskKind> {
CURRENT_TASK.try_with(|ct| ct.kind).ok()
}
pub fn current_task_id() -> Option<PageserverTaskId> {
CURRENT_TASK.try_with(|ct| ct.task_id).ok()
}
/// A Future that can be used to check if the current task has been requested to
/// shut down.
pub async fn shutdown_watcher() {
let mut shutdown_rx = SHUTDOWN_RX
.try_with(|rx| rx.clone())
let token = SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_requested() called in an unexpected task or thread");
while !*shutdown_rx.borrow() {
if shutdown_rx.changed().await.is_err() {
break;
}
}
token.cancelled().await;
}
/// Clone the current task's cancellation token, which can be moved across tasks.
///
/// When the task which is currently executing is shutdown, the cancellation token will be
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
/// `tokio::task::JoinSet::spawn`.
pub fn shutdown_token() -> CancellationToken {
SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_token() called in an unexpected task or thread")
}
/// Has the current task been requested to shut down?
pub fn is_shutdown_requested() -> bool {
if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) {
*shutdown_rx.borrow()
if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
cancel.is_cancelled()
} else {
if !cfg!(test) {
warn!("is_shutdown_requested() called in an unexpected task or thread");

File diff suppressed because it is too large Load Diff

View File

@@ -30,7 +30,7 @@ pub mod defaults {
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
pub const DEFAULT_GC_PERIOD: &str = "100 s";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
@@ -191,11 +191,10 @@ impl TenantConfOpt {
}
}
impl TenantConf {
pub fn default() -> TenantConf {
impl Default for TenantConf {
fn default() -> Self {
use defaults::*;
TenantConf {
Self {
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
.expect("cannot parse default checkpoint timeout"),
@@ -220,29 +219,4 @@ impl TenantConf {
trace_read_requests: false,
}
}
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: Duration::from_secs(600),
compaction_target_size: 4 * 1024 * 1024,
compaction_period: Duration::from_secs(10),
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
pitr_interval: Duration::from_secs(60 * 60),
walreceiver_connect_timeout: humantime::parse_duration(
defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
)
.unwrap(),
lagging_wal_timeout: humantime::parse_duration(
defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT,
)
.unwrap(),
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.unwrap(),
trace_read_requests: false,
}
}
}

View File

@@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
off += keys_len as u64;
let values_off = off as usize;
let values_len = num_children as usize * VALUE_SZ as usize;
let values_len = num_children as usize * VALUE_SZ;
//off += values_len as u64;
let prefix = &buf[prefix_off..prefix_off + prefix_len as usize];
@@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
while low < high {
let mid = low + size / 2;
let key_off = mid as usize * self.suffix_len as usize;
let key_off = mid * self.suffix_len as usize;
let suffix = &self.keys[key_off..key_off + self.suffix_len as usize];
// Does this match?
keybuf[self.prefix_len as usize..].copy_from_slice(suffix);
@@ -328,7 +328,7 @@ where
while idx < node.num_children as usize {
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx as usize);
let value = node.value(idx);
#[allow(clippy::collapsible_if)]
if node.level == 0 {
// leaf
@@ -368,7 +368,7 @@ where
key_off -= suffix_len;
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx as usize);
let value = node.value(idx);
#[allow(clippy::collapsible_if)]
if node.level == 0 {
// leaf
@@ -629,7 +629,7 @@ impl<const L: usize> BuildNode<L> {
self.keys.extend(&key[self.prefix.len()..]);
self.values.extend(value.0);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
self.size += self.suffix_len + VALUE_SZ;
@@ -674,7 +674,7 @@ impl<const L: usize> BuildNode<L> {
self.size -= prefix_len * self.num_children as usize;
self.size += prefix_len;
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
true
@@ -684,7 +684,7 @@ impl<const L: usize> BuildNode<L> {
/// Serialize the node to on-disk format.
///
fn pack(&self) -> Bytes {
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
assert!(self.num_children > 0);
@@ -940,7 +940,7 @@ mod tests {
let t = -(f64::ln(u));
let key_int = (t * 1000000.0) as u128;
all_data.insert(key_int as u128, idx as u64);
all_data.insert(key_int, idx as u64);
}
// Build a tree from it

View File

@@ -91,7 +91,7 @@ impl EphemeralFile {
break;
}
off += n as usize;
off += n;
}
Ok(())
}

View File

@@ -12,7 +12,6 @@
use crate::metrics::NUM_ONDISK_LAYERS;
use crate::repository::Key;
use crate::tenant::inmemory_layer::InMemoryLayer;
use crate::tenant::storage_layer::{range_eq, range_overlaps};
use amplify_num::i256;
use anyhow::Result;
@@ -27,7 +26,7 @@ use std::sync::Arc;
use tracing::*;
use utils::lsn::Lsn;
use super::storage_layer::Layer;
use super::storage_layer::{InMemoryLayer, Layer};
///
/// LayerMap tracks what layers exist on a timeline.
@@ -261,8 +260,10 @@ where
/// contain the version, even if it's missing from the returned
/// layer.
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult<L>>> {
// linear search
/// NOTE: This only searches the 'historic' layers, *not* the
/// 'open' and 'frozen' layers!
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
// Find the latest image layer that covers the given key
let mut latest_img: Option<Arc<L>> = None;
let mut latest_img_lsn: Option<Lsn> = None;
@@ -286,10 +287,10 @@ where
assert!(img_lsn < end_lsn);
if Lsn(img_lsn.0 + 1) == end_lsn {
// found exact match
return Ok(Some(SearchResult {
return Some(SearchResult {
layer: Arc::clone(l),
lsn_floor: img_lsn,
}));
});
}
if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
latest_img = Some(Arc::clone(l));
@@ -327,14 +328,16 @@ where
latest_delta.replace(Arc::clone(l));
break;
}
// this layer's end LSN is smaller than the requested point. If there's
// nothing newer, this is what we need to return. Remember this.
if let Some(old_candidate) = &latest_delta {
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) {
// this layer's end LSN is smaller than the requested point. If there's
// nothing newer, this is what we need to return. Remember this.
if let Some(old_candidate) = &latest_delta {
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
latest_delta.replace(Arc::clone(l));
}
} else {
latest_delta.replace(Arc::clone(l));
}
} else {
latest_delta.replace(Arc::clone(l));
}
}
if let Some(l) = latest_delta {
@@ -346,19 +349,19 @@ where
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
l.get_lsn_range().start,
);
Ok(Some(SearchResult {
Some(SearchResult {
lsn_floor,
layer: l,
}))
})
} else if let Some(l) = latest_img {
trace!("found img layer and no deltas for request on {key} at {end_lsn}");
Ok(Some(SearchResult {
Some(SearchResult {
lsn_floor: latest_img_lsn.unwrap(),
layer: l,
}))
})
} else {
trace!("no layer found for request on {key} at {end_lsn}");
Ok(None)
None
}
}

View File

@@ -255,8 +255,7 @@ pub fn save_metadata(
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
&path
.parent()
path.parent()
.expect("Metadata should always have a parent dir"),
)?;
timeline_dir.sync_all()?;

View File

@@ -17,8 +17,8 @@ use utils::crashsafe;
use crate::config::PageServerConf;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{Tenant, TenantState};
use crate::tenant_config::TenantConfOpt;
use crate::IGNORED_TENANT_FILE_NAME;
use utils::fs_ext::PathExt;
@@ -196,7 +196,7 @@ pub async fn shutdown_all_tenants() {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");
if let Err(err) = tenant.checkpoint().await {
if let Err(err) = tenant.freeze_and_flush().await {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}
@@ -216,8 +216,7 @@ pub async fn create_tenant(
hash_map::Entry::Vacant(v) => {
// Hold the write_tenants() lock, since all of this is local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
let tenant_directory =
super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
let created_tenant =
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
let crated_tenant_id = created_tenant.tenant_id();
@@ -262,27 +261,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul
}
pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
// Start with the shutdown of timeline tasks (this shuts down the walreceiver)
// It is important that we do not take locks here, and do not check whether the timeline exists
// because if we hold tenants_state::write_tenants() while awaiting for the tasks to join
// we cannot create new timelines and tenants, and that can take quite some time,
// it can even become stuck due to a bug making whole pageserver unavailable for some operations
// so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
// and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
// will synchronize and either fail with the not found error or succeed
debug!("waiting for wal receiver to shutdown");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(tenant_id),
Some(timeline_id),
)
.await;
debug!("wal receiver shutdown confirmed");
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
info!("timeline task shutdown completed");
match get_tenant(tenant_id, true).await {
Ok(tenant) => {
tenant.delete_timeline(timeline_id).await?;
@@ -452,7 +430,7 @@ where
Err(e) => {
let tenants_accessor = TENANTS.read().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => tenant.set_broken(),
Some(tenant) => tenant.set_broken(&e.to_string()),
None => warn!("Tenant {tenant_id} got removed from memory"),
}
Err(e)
@@ -496,7 +474,7 @@ pub async fn immediate_gc(
async move {
fail::fail_point!("immediate_gc_task_pre");
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.gc_iteration(Some(timeline_id), gc_horizon, pitr)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
@@ -514,3 +492,53 @@ pub async fn immediate_gc(
Ok(wait_task_done)
}
#[cfg(feature = "testing")]
pub async fn immediate_compact(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
let guard = TENANTS.read().await;
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("Tenant {tenant_id} not found"))
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
// Run in task_mgr to avoid race with detach operation
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Compaction,
Some(tenant_id),
Some(timeline_id),
&format!(
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
),
false,
async move {
let result = timeline
.compact()
.instrument(
info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
)
.await;
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send compaction result: {result:?}"),
}
Ok(())
},
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
Ok(wait_task_done)
}

View File

@@ -32,7 +32,8 @@
//! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]:
//!
//! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file.
//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file.
//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file.
//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads
//! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files.
//!
//! Internally, these functions create [`UploadOp`]s and put them in a queue.
@@ -57,7 +58,7 @@
//! To have a consistent remote structure, it's important that uploads and
//! deletions are performed in the right order. For example, the index file
//! contains a list of layer files, so it must not be uploaded until all the
//! layer files that are in its list have been succesfully uploaded.
//! layer files that are in its list have been successfully uploaded.
//!
//! The contract between client and its user is that the user is responsible of
//! scheduling operations in an order that keeps the remote consistent as
@@ -139,7 +140,7 @@
//! Note that if we crash during file deletion between the index update
//! that removes the file from the list of files, and deleting the remote file,
//! the file is leaked in the remote storage. Similarly, if a new file is created
//! and uploaded, but the pageserver dies permantently before updating the
//! and uploaded, but the pageserver dies permanently before updating the
//! remote index file, the new file is leaked in remote storage. We accept and
//! tolerate that for now.
//! Note further that we cannot easily fix this by scheduling deletes for every
@@ -147,31 +148,43 @@
//! following two cases:
//! - (1) We had the file locally, deleted it locally, scheduled a remote delete,
//! but crashed before it finished remotely.
//! - (2) We never had the file locally because we were still in tenant attach
//! when we crashed. (Similar case for on-demand download in the future.)
//! - (2) We never had the file locally because we haven't on-demand downloaded
//! it yet.
//!
//! # Downloads (= Tenant Attach)
//! # Downloads
//!
//! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
//! downloading files from the remote storage. Downloads are performed immediately,
//! independently of the uploads.
//! downloading files from the remote storage. Downloads are performed immediately
//! against the `RemoteStorage`, independently of the upload queue.
//!
//! When we attach a tenant, we perform the following steps:
//! - create `Tenant` object in `TenantState::Attaching` state
//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart`
//! - eagerly download all the remote layers using the client's download APIs
//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state.
//! - List timelines that are present in remote storage, and for each:
//! - download their remote [`IndexPart`]s
//! - create `Timeline` struct and a `RemoteTimelineClient`
//! - initialize the client's upload queue with its `IndexPart`
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
//! but not present locally
//! - schedule uploads for layers that are only present locally.
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
//! the local filesystem, write the remote metadata to the local filesystem
//! - After the above is done for each timeline, open the tenant for business by
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
//!
//! Most of the above happens in [`Timeline::reconcile_with_remote`].
//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
//! We keep track of the fact that a client is in `Attaching` state in a marker
//! file on the local disk.
//! However, the distinction is moot for storage sync since we call
//! `reconcile_with_remote` for tenants both with and without the marker file.
//!
//! In the future, downloading will be done on-demand and `reconcile_with_remote`
//! will only be responsible for re-scheduling upload ops after a crash of an
//! `Active` tenant.
//! file on the local disk. This is critical because, when we restart the pageserver,
//! we do not want to do the `List timelines` step for each tenant that has already
//! been successfully attached (for performance & cost reasons).
//! Instead, for a tenant without the attach marker file, we assume that the
//! local state is in sync or ahead of the remote state. This includes the list
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
//! if there's a timeline on the remote that the pageserver doesn't know about,
//! the GC will not consider its branch point, leading to data loss.
//! So, for a tenant with the attach marker file, we know that we do not yet have
//! persisted all the remote timeline's metadata files locally. To exclude the
//! risk above, we re-run the procedure for such tenants
//!
//! # Operating Without Remote Storage
//!
@@ -194,39 +207,51 @@ mod upload;
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
use std::ops::DerefMut;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use anyhow::ensure;
use remote_storage::{DownloadError, GenericRemoteStorage};
use std::ops::DerefMut;
use tokio::runtime::Runtime;
use tracing::{debug, info, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use self::index::IndexPart;
use crate::metrics::MeasureRemoteOp;
use crate::metrics::RemoteOpFileKind;
use crate::metrics::RemoteOpKind;
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
use crate::tenant::filename::LayerFileName;
use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::{
config::PageServerConf,
storage_sync::index::LayerFileMetadata,
task_mgr,
task_mgr::TaskKind,
task_mgr::BACKGROUND_RUNTIME,
tenant::metadata::TimelineMetadata,
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
};
use utils::id::{TenantId, TimelineId};
use self::index::IndexPart;
use super::storage_layer::LayerFileName;
// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry.
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
// level instead, as repeated failures can mean a more serious problem. If it
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
// Similarly log failed uploads and deletions at WARN level, after this many
// retries. Uploads and deletions are retried forever, though.
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
/// A client for accessing a timeline's data in remote storage.
///
/// This takes care of managing the number of connections, and balancing them
@@ -256,209 +281,42 @@ pub struct RemoteTimelineClient {
upload_queue: Mutex<UploadQueue>,
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
}
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
// memory for Uninitialized variants. Doesn't matter in practice, there are not
// that many upload queues in a running pageserver, and most of them are initialized
// anyway.
#[allow(clippy::large_enum_variant)]
enum UploadQueue {
Uninitialized,
Initialized(UploadQueueInitialized),
Stopped(UploadQueueStopped),
}
impl UploadQueue {
fn as_str(&self) -> &'static str {
match self {
UploadQueue::Uninitialized => "Uninitialized",
UploadQueue::Initialized(_) => "Initialized",
UploadQueue::Stopped(_) => "Stopped",
}
}
}
/// This keeps track of queued and in-progress tasks.
struct UploadQueueInitialized {
/// Counter to assign task IDs
task_counter: u64,
/// All layer files stored in the remote storage, taking into account all
/// in-progress and queued operations
latest_files: HashMap<LayerFileName, LayerFileMetadata>,
/// Metadata stored in the remote storage, taking into account all
/// in-progress and queued operations.
/// DANGER: do not return to outside world, e.g., safekeepers.
latest_metadata: TimelineMetadata,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
/// Safekeeper can rely on it to make decisions for WAL storage.
last_uploaded_consistent_lsn: Lsn,
// Breakdown of different kinds of tasks currently in-progress
num_inprogress_layer_uploads: usize,
num_inprogress_metadata_uploads: usize,
num_inprogress_deletions: usize,
/// Tasks that are currently in-progress. In-progress means that a tokio Task
/// has been launched for it. An in-progress task can be busy uploading, but it can
/// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
/// be waiting for retry in `exponential_backoff`.
inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
/// Queued operations that have not been launched yet. They might depend on previous
/// tasks to finish. For example, metadata upload cannot be performed before all
/// preceding layer file uploads have completed.
queued_operations: VecDeque<UploadOp>,
}
struct UploadQueueStopped {
last_uploaded_consistent_lsn: Lsn,
}
impl UploadQueue {
fn initialize_empty_remote(
&mut self,
metadata: &TimelineMetadata,
) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized => (),
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
anyhow::bail!("already initialized, state {}", self.as_str())
}
}
info!("initializing upload queue for empty remote");
let state = UploadQueueInitialized {
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
latest_files: HashMap::new(),
latest_metadata: metadata.clone(),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
last_uploaded_consistent_lsn: Lsn(0),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
*self = UploadQueue::Initialized(state);
Ok(self.initialized_mut().expect("we just set it"))
}
fn initialize_with_current_remote_index_part(
&mut self,
index_part: &IndexPart,
) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized => (),
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
anyhow::bail!("already initialized, state {}", self.as_str())
}
}
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
for layer_name in &index_part.timeline_layers {
let layer_metadata = index_part
.layer_metadata
.get(layer_name)
.map(LayerFileMetadata::from)
.unwrap_or(LayerFileMetadata::MISSING);
files.insert(layer_name.to_owned(), layer_metadata);
}
let index_part_metadata = index_part.parse_metadata()?;
info!(
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
index_part_metadata.disk_consistent_lsn()
);
let state = UploadQueueInitialized {
latest_files: files,
latest_metadata: index_part_metadata.clone(),
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
*self = UploadQueue::Initialized(state);
Ok(self.initialized_mut().expect("we just set it"))
}
fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Initialized(x) => Ok(x),
}
}
}
/// An in-progress upload or delete task.
#[derive(Debug)]
struct UploadTask {
/// Unique ID of this task. Used as the key in `inprogress_tasks` above.
task_id: u64,
retries: AtomicU32,
op: UploadOp,
}
#[derive(Debug)]
enum UploadOp {
/// Upload a layer file
UploadLayer(LayerFileName, LayerFileMetadata),
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete a file.
Delete(RemoteOpFileKind, LayerFileName),
/// Barrier. When the barrier operation is reached,
Barrier(tokio::sync::watch::Sender<()>),
}
impl std::fmt::Display for UploadOp {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
UploadOp::UploadLayer(path, metadata) => {
write!(
f,
"UploadLayer({}, size={:?})",
path.file_name(),
metadata.file_size()
)
}
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}
}
impl RemoteTimelineClient {
///
/// Create a remote storage client for given timeline
///
/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
/// by calling init_upload_queue.
///
pub fn new(
remote_storage: GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> RemoteTimelineClient {
RemoteTimelineClient {
conf,
runtime: &BACKGROUND_RUNTIME,
tenant_id,
timeline_id,
storage_impl: remote_storage,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
}
}
/// Initialize the upload queue for a remote storage that already received
/// an index file upload, i.e., it's not empty.
/// The given `index_part` must be the one on the remote.
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
Ok(())
}
@@ -470,6 +328,7 @@ impl RemoteTimelineClient {
) -> anyhow::Result<()> {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata)?;
self.update_remote_physical_size_gauge(None);
Ok(())
}
@@ -481,6 +340,24 @@ impl RemoteTimelineClient {
}
}
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
.layer_metadata
.values()
// If we don't have the file size for the layer, don't account for it in the metric.
.map(|ilmd| ilmd.file_size.unwrap_or(0))
.sum()
} else {
0
};
self.metrics.remote_physical_size_gauge().set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
self.metrics.remote_physical_size_gauge().get()
}
//
// Download operations.
//
@@ -490,6 +367,10 @@ impl RemoteTimelineClient {
/// Download index file
pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
let _unfinished_gauge_guard = self
.metrics
.call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
download::download_index_part(
self.conf,
&self.storage_impl,
@@ -501,6 +382,7 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await
}
@@ -515,21 +397,27 @@ impl RemoteTimelineClient {
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<u64> {
let downloaded_size = download::download_layer_file(
self.conf,
&self.storage_impl,
self.tenant_id,
self.timeline_id,
layer_file_name,
layer_metadata,
)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
)
.await?;
let downloaded_size = {
let _unfinished_gauge_guard = self
.metrics
.call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
download::download_layer_file(
self.conf,
&self.storage_impl,
self.tenant_id,
self.timeline_id,
layer_file_name,
layer_metadata,
)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await?
};
// Update the metadata for given layer file. The remote index file
// might be missing some information for the file; this allows us
@@ -539,7 +427,17 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
upgraded.merge(&new_metadata);
if upgraded.merge(&new_metadata) {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
// If we don't do an index file upload inbetween here and restart,
// the value will go back down after pageserver restart, since we will
// have lost this data point.
// But, we upload index part fairly frequently, and restart pageserver rarely.
// So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
self.metrics
.remote_physical_size_gauge()
.add(downloaded_size);
} else {
// The file should exist, since we just downloaded it.
warn!(
@@ -556,14 +454,20 @@ impl RemoteTimelineClient {
//
///
/// Launch an index-file upload operation in the background.
/// Launch an index-file upload operation in the background, with
/// updated metadata.
///
/// The upload will be added to the queue immediately, but it
/// won't be performed until all previosuly scheduled layer file
/// upload operations have completed successfully. This is to
/// ensure that when the index file claims that layers X, Y and Z
/// exist in remote storage, they really do.
pub fn schedule_index_upload(
/// exist in remote storage, they really do. To wait for the upload
/// to complete, use `wait_completion`.
///
/// If there were any changes to the list of files, i.e. if any
/// layer file uploads were scheduled, since the last index file
/// upload, those will be included too.
pub fn schedule_index_upload_for_metadata_update(
self: &Arc<Self>,
metadata: &TimelineMetadata,
) -> anyhow::Result<()> {
@@ -574,26 +478,60 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
/// Use this function to schedule the update of the index file after
/// scheduling file uploads or deletions. If no file uploads or deletions
/// have been scheduled since the last index file upload, this does
/// nothing.
///
/// Like schedule_index_upload_for_metadata_update(), this merely adds
/// the upload to the upload queue and returns quickly.
pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
}
Ok(())
}
/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata_bytes: Vec<u8>,
) {
info!(
"scheduling metadata upload with {} files ({} changed)",
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
upload_queue.latest_metadata.to_bytes()?,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.update_upload_queue_unfinished_metric(1, &op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!(
"scheduled metadata upload with {} files",
upload_queue.latest_files.len()
);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
///
@@ -617,9 +555,10 @@ impl RemoteTimelineClient {
upload_queue
.latest_files
.insert(layer_file_name.clone(), layer_metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
self.update_upload_queue_unfinished_metric(1, &op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!(
@@ -635,8 +574,11 @@ impl RemoteTimelineClient {
///
/// Launch a delete operation in the background.
///
/// The deletion won't actually be performed, until all preceding
/// upload operations have completed succesfully.
/// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed
/// succesfully.
///
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
@@ -647,7 +589,6 @@ impl RemoteTimelineClient {
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
@@ -659,21 +600,17 @@ impl RemoteTimelineClient {
let no_bail_here = || {
for name in names {
upload_queue.latest_files.remove(name);
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.update_upload_queue_unfinished_metric(1, &op);
upload_queue.queued_operations.push_back(op);
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata_bytes);
}
// schedule the actual deletions
for name in names {
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
self.update_upload_queue_unfinished_metric(1, &op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {}", name.file_name());
}
@@ -825,7 +762,7 @@ impl RemoteTimelineClient {
// upload finishes or times out soon enough.
if task_mgr::is_shutdown_requested() {
info!("upload task cancelled by shutdown request");
self.update_upload_queue_unfinished_metric(-1, &task.op);
self.calls_unfinished_metric_end(&task.op);
self.stop();
return;
}
@@ -847,11 +784,12 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await
}
UploadOp::UploadMetadata(ref index_part, _lsn) => {
upload::upload_index_part(
let res = upload::upload_index_part(
self.conf,
&self.storage_impl,
self.tenant_id,
@@ -863,8 +801,13 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await
.await;
if res.is_ok() {
self.update_remote_physical_size_gauge(Some(index_part));
}
res
}
UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
let path = &self
@@ -877,6 +820,7 @@ impl RemoteTimelineClient {
self.timeline_id,
*metric_file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
@@ -895,12 +839,14 @@ impl RemoteTimelineClient {
Err(e) => {
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
// uploads may fail due to rate limts (IAM, S3) or spurious network and external errors
// such issues are relatively regular, so don't use WARN or ERROR to avoid alerting
// people and tests until the retries are definitely causing delays.
if retries < 3 {
// Uploads can fail due to rate limits (IAM, S3), spurious network problems,
// or other external reasons. Such issues are relatively regular, so log them
// at info level at first, and only WARN if the operation fails repeatedly.
//
// (See similar logic for downloads in `download::download_retry`)
if retries < FAILED_UPLOAD_WARN_THRESHOLD {
info!(
"failed to perform remote task {}, will retry (attempt {}): {:?}",
"failed to perform remote task {}, will retry (attempt {}): {:#}",
task.op, retries, e
);
} else {
@@ -964,28 +910,40 @@ impl RemoteTimelineClient {
// Launch any queued tasks that were unblocked by this one.
self.launch_queued_tasks(upload_queue);
}
self.update_upload_queue_unfinished_metric(-1, &task.op);
self.calls_unfinished_metric_end(&task.op);
}
fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) {
let (file_kind, op_kind) = match op {
fn calls_unfinished_metric_impl(
&self,
op: &UploadOp,
) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
let res = match op {
UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
UploadOp::Barrier(_) => {
// we do not account these
return;
return None;
}
};
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
file_kind.as_str(),
op_kind.as_str(),
])
.unwrap()
.add(delta)
Some(res)
}
fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
Some(x) => x,
None => return,
};
let guard = self.metrics.call_begin(&file_kind, &op_kind);
guard.will_decrement_manually(); // in unfinished_ops_metric_end()
}
fn calls_unfinished_metric_end(&self, op: &UploadOp) {
let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
Some(x) => x,
None => return,
};
self.metrics.call_end(&file_kind, &op_kind);
}
fn stop(&self) {
@@ -1036,7 +994,7 @@ impl RemoteTimelineClient {
// Tear down queued ops
for op in qi.queued_operations.into_iter() {
self.update_upload_queue_unfinished_metric(-1, &op);
self.calls_unfinished_metric_end(&op);
// Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
// which is exactly what we want to happen.
drop(op);
@@ -1049,28 +1007,6 @@ impl RemoteTimelineClient {
}
}
///
/// Create a remote storage client for given timeline
///
/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
/// by calling init_upload_queue.
///
pub fn create_remote_timeline_client(
remote_storage: GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<RemoteTimelineClient> {
Ok(RemoteTimelineClient {
conf,
runtime: &BACKGROUND_RUNTIME,
tenant_id,
timeline_id,
storage_impl: remote_storage,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
})
}
#[cfg(test)]
mod tests {
use super::*;
@@ -1180,6 +1116,10 @@ mod tests {
timeline_id: TIMELINE_ID,
storage_impl,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&harness.tenant_id,
&TIMELINE_ID,
)),
});
let remote_timeline_dir =
@@ -1211,15 +1151,19 @@ mod tests {
assert!(upload_queue.queued_operations.is_empty());
assert!(upload_queue.inprogress_tasks.len() == 2);
assert!(upload_queue.num_inprogress_layer_uploads == 2);
// also check that `latest_file_changes` was updated
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
}
// Schedule upload of index. Check that it is queued
let metadata = dummy_metadata(Lsn(0x20));
client.schedule_index_upload(&metadata)?;
client.schedule_index_upload_for_metadata_update(&metadata)?;
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
assert!(upload_queue.queued_operations.len() == 1);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
}
// Wait for the uploads to finish
@@ -1255,6 +1199,7 @@ mod tests {
assert!(upload_queue.inprogress_tasks.len() == 1);
assert!(upload_queue.num_inprogress_layer_uploads == 1);
assert!(upload_queue.num_inprogress_deletions == 0);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
}
assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);

View File

@@ -0,0 +1,316 @@
//! Helper functions to download files from remote storage with a RemoteStorage
//!
//! The functions in this module retry failed operations automatically, according
//! to the FAILED_DOWNLOAD_RETRIES constant.
use std::collections::HashSet;
use std::future::Future;
use std::path::Path;
use anyhow::{anyhow, Context};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tracing::{error, info, warn};
use crate::config::PageServerConf;
use crate::tenant::storage_layer::LayerFileName;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await
}
///
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
///
/// Returns the size of the downloaded file.
pub async fn download_layer_file<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
) -> Result<u64, DownloadError> {
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
let local_path = timeline_path.join(layer_file_name.file_name());
let remote_path = conf
.remote_path(&local_path)
.map_err(DownloadError::Other)?;
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let (mut destination_file, bytes_amount) = download_retry(
|| async {
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let mut download = storage.download(&remote_path).await.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
)
})
.map_err(DownloadError::Other)?;
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
})
.map_err(DownloadError::Other)?;
Ok((destination_file, bytes_amount))
},
&format!("download {remote_path:?}"),
).await?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
// But for additional safety lets check/wait for any pending operations.
destination_file
.flush()
.await
.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
match layer_metadata.file_size() {
Some(expected) if expected != bytes_amount => {
return Err(DownloadError::Other(anyhow!(
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
temp_file_path.display()
)));
}
Some(_) | None => {
// matches, or upgrading from an earlier IndexPart version
}
}
// not using sync_data because it can lose file size update
destination_file
.sync_all()
.await
.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
Err(DownloadError::Other(anyhow!(
"remote-storage-download-pre-rename failpoint triggered"
)))
});
fs::rename(&temp_file_path, &local_path)
.await
.with_context(|| {
format!(
"Could not rename download layer file to {}",
local_path.display(),
)
})
.map_err(DownloadError::Other)?;
fsync_path(&local_path)
.await
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
.map_err(DownloadError::Other)?;
tracing::info!("download complete: {}", local_path.display());
Ok(bytes_amount)
}
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub fn is_temp_download_file(path: &Path) -> bool {
let extension = path.extension().map(|pname| {
pname
.to_str()
.expect("paths passed to this function must be valid Rust strings")
});
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,
Some(_) => false,
None => false,
}
}
/// List timelines of given tenant in remote storage
pub async fn list_remote_timelines<'a>(
storage: &'a GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<HashSet<TimelineId>> {
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = conf.remote_path(&tenant_path)?;
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
});
let timelines = download_retry(
|| storage.list_prefixes(Some(&tenant_storage_path)),
&format!("list prefixes for {tenant_path:?}"),
)
.await?;
if timelines.is_empty() {
anyhow::bail!("no timelines found on the remote storage")
}
let mut timeline_ids = HashSet::new();
for timeline_remote_storage_key in timelines {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: TimelineId = object_name.parse().with_context(|| {
format!("failed to parse object name into timeline id '{object_name}'")
})?;
// list_prefixes is assumed to return unique names. Ensure this here.
// NB: it's safer to bail out than warn-log this because the pageserver
// needs to absolutely know about _all_ timelines that exist, so that
// GC knows all the branchpoints. If we skipped over a timeline instead,
// GC could delete a layer that's still needed by that timeline.
anyhow::ensure!(
!timeline_ids.contains(&timeline_id),
"list_prefixes contains duplicate timeline id {timeline_id}"
);
timeline_ids.insert(timeline_id);
}
Ok(timeline_ids)
}
pub(super) async fn download_index_part(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<IndexPart, DownloadError> {
let index_part_path = conf
.metadata_path(timeline_id, tenant_id)
.with_file_name(IndexPart::FILE_NAME);
let part_storage_path = conf
.remote_path(&index_part_path)
.map_err(DownloadError::BadInput)?;
let index_part_bytes = download_retry(
|| async {
let mut index_part_download = storage.download(&part_storage_path).await?;
let mut index_part_bytes = Vec::new();
tokio::io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| {
format!("Failed to download an index part into file {index_part_path:?}")
})
.map_err(DownloadError::Other)?;
Ok(index_part_bytes)
},
&format!("download {part_storage_path:?}"),
)
.await?;
let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
.with_context(|| {
format!("Failed to deserialize index part file into file {index_part_path:?}")
})
.map_err(DownloadError::Other)?;
let index_part = index_part.remove_unclean_layer_file_names();
Ok(index_part)
}
///
/// Helper function to handle retries for a download operation.
///
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
/// with backoff.
///
/// (See similar logic for uploads in `perform_upload_task`)
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
{
let mut attempts = 0;
loop {
let result = op().await;
match result {
Ok(_) => {
if attempts > 0 {
info!("{description} succeeded after {attempts} retries");
}
return result;
}
// These are "permanent" errors that should not be retried.
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
return result;
}
// Assume that any other failure might be transient, and the operation might
// succeed if we just keep trying.
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(ref err)) => {
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
error!("{description} still failed after {attempts} retries, giving up: {err:?}");
return result;
}
}
// sleep and retry
exponential_backoff(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
)
.await;
attempts += 1;
}
}

Some files were not shown because too many files have changed in this diff Show More