mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-29 02:00:37 +00:00
Compare commits
7 Commits
proxy-refa
...
refactor-r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9b5792b9bf | ||
|
|
be0dfa9d3a | ||
|
|
292c42731e | ||
|
|
867b35ce55 | ||
|
|
14ff793582 | ||
|
|
5aaa5302eb | ||
|
|
6a53b8fac6 |
@@ -4,7 +4,7 @@
|
|||||||
hakari-package = "workspace_hack"
|
hakari-package = "workspace_hack"
|
||||||
|
|
||||||
# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
|
# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
|
||||||
dep-format-version = "3"
|
dep-format-version = "2"
|
||||||
|
|
||||||
# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
|
# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
|
||||||
# Hakari works much better with the new feature resolver.
|
# Hakari works much better with the new feature resolver.
|
||||||
|
|||||||
@@ -123,8 +123,8 @@ runs:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
||||||
# -n16 uses sixteen processes to run tests via pytest-xdist
|
# -n4 uses four processes to run tests via pytest-xdist
|
||||||
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
|
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||||
|
|
||||||
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
|
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
|
||||||
# to the same worker to make @pytest.mark.order work with xdist
|
# to the same worker to make @pytest.mark.order work with xdist
|
||||||
|
|||||||
6
.github/ansible/deploy.yaml
vendored
6
.github/ansible/deploy.yaml
vendored
@@ -117,8 +117,7 @@
|
|||||||
shell:
|
shell:
|
||||||
cmd: |
|
cmd: |
|
||||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
|
curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
|
||||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
|
|
||||||
tags:
|
tags:
|
||||||
- pageserver
|
- pageserver
|
||||||
|
|
||||||
@@ -187,7 +186,6 @@
|
|||||||
shell:
|
shell:
|
||||||
cmd: |
|
cmd: |
|
||||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
|
curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
|
||||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
|
|
||||||
tags:
|
tags:
|
||||||
- safekeeper
|
- safekeeper
|
||||||
|
|||||||
@@ -6,8 +6,6 @@ storage:
|
|||||||
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
|
||||||
metric_collection_interval: 10min
|
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
bucket_region: "{{ bucket_region }}"
|
bucket_region: "{{ bucket_region }}"
|
||||||
|
|||||||
2
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
2
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -6,8 +6,6 @@ storage:
|
|||||||
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
|
||||||
metric_collection_interval: 10min
|
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
bucket_region: "{{ bucket_region }}"
|
bucket_region: "{{ bucket_region }}"
|
||||||
|
|||||||
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -6,8 +6,6 @@ storage:
|
|||||||
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
|
||||||
metric_collection_interval: 10min
|
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
bucket_region: "{{ bucket_region }}"
|
bucket_region: "{{ bucket_region }}"
|
||||||
@@ -36,4 +34,4 @@ storage:
|
|||||||
ansible_host: i-06d113fb73bfddeb0
|
ansible_host: i-06d113fb73bfddeb0
|
||||||
safekeeper-2.us-east-2.aws.neon.tech:
|
safekeeper-2.us-east-2.aws.neon.tech:
|
||||||
ansible_host: i-09f66c8e04afff2e8
|
ansible_host: i-09f66c8e04afff2e8
|
||||||
|
|
||||||
|
|||||||
2
.github/ansible/prod.us-west-2.hosts.yaml
vendored
2
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -6,8 +6,6 @@ storage:
|
|||||||
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
|
||||||
metric_collection_interval: 10min
|
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
bucket_region: "{{ bucket_region }}"
|
bucket_region: "{{ bucket_region }}"
|
||||||
|
|||||||
2
.github/ansible/production.hosts.yaml
vendored
2
.github/ansible/production.hosts.yaml
vendored
@@ -7,8 +7,6 @@ storage:
|
|||||||
broker_endpoint: http://storage-broker.prod.local:50051
|
broker_endpoint: http://storage-broker.prod.local:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
|
||||||
metric_collection_interval: 10min
|
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
bucket_region: "{{ bucket_region }}"
|
bucket_region: "{{ bucket_region }}"
|
||||||
|
|||||||
2
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
2
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -18,7 +18,7 @@ storage:
|
|||||||
ansible_aws_ssm_region: eu-west-1
|
ansible_aws_ssm_region: eu-west-1
|
||||||
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
|
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
|
||||||
console_region_id: aws-eu-west-1
|
console_region_id: aws-eu-west-1
|
||||||
sentry_environment: staging
|
sentry_environment: development
|
||||||
|
|
||||||
children:
|
children:
|
||||||
pageservers:
|
pageservers:
|
||||||
|
|||||||
4
.github/ansible/staging.us-east-2.hosts.yaml
vendored
4
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -18,7 +18,7 @@ storage:
|
|||||||
ansible_aws_ssm_region: us-east-2
|
ansible_aws_ssm_region: us-east-2
|
||||||
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
|
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
|
||||||
console_region_id: aws-us-east-2
|
console_region_id: aws-us-east-2
|
||||||
sentry_environment: staging
|
sentry_environment: development
|
||||||
|
|
||||||
children:
|
children:
|
||||||
pageservers:
|
pageservers:
|
||||||
@@ -29,8 +29,6 @@ storage:
|
|||||||
ansible_host: i-0565a8b4008aa3f40
|
ansible_host: i-0565a8b4008aa3f40
|
||||||
pageserver-2.us-east-2.aws.neon.build:
|
pageserver-2.us-east-2.aws.neon.build:
|
||||||
ansible_host: i-01e31cdf7e970586a
|
ansible_host: i-01e31cdf7e970586a
|
||||||
pageserver-3.us-east-2.aws.neon.build:
|
|
||||||
ansible_host: i-0602a0291365ef7cc
|
|
||||||
|
|
||||||
safekeepers:
|
safekeepers:
|
||||||
hosts:
|
hosts:
|
||||||
|
|||||||
@@ -8,10 +8,8 @@ settings:
|
|||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||||
domain: "*.eu-west-1.aws.neon.build"
|
domain: "*.eu-west-1.aws.neon.build"
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "development"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "1min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -49,4 +49,4 @@ extraManifests:
|
|||||||
- "{{ .Release.Namespace }}"
|
- "{{ .Release.Namespace }}"
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "development"
|
||||||
|
|||||||
@@ -8,10 +8,7 @@ settings:
|
|||||||
authBackend: "link"
|
authBackend: "link"
|
||||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||||
uri: "https://console.stage.neon.tech/psql_session/"
|
uri: "https://console.stage.neon.tech/psql_session/"
|
||||||
domain: "pg.neon.build"
|
sentryEnvironment: "development"
|
||||||
sentryEnvironment: "staging"
|
|
||||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "1min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy-link pods
|
# -- Additional labels for neon-proxy-link pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -8,10 +8,8 @@ settings:
|
|||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||||
domain: "*.cloud.stage.neon.tech"
|
domain: "*.cloud.stage.neon.tech"
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "development"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "1min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -8,10 +8,8 @@ settings:
|
|||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||||
domain: "*.us-east-2.aws.neon.build"
|
domain: "*.us-east-2.aws.neon.build"
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "development"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "1min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -49,4 +49,4 @@ extraManifests:
|
|||||||
- "{{ .Release.Namespace }}"
|
- "{{ .Release.Namespace }}"
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "development"
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ settings:
|
|||||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "10min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ settings:
|
|||||||
domain: "*.eu-central-1.aws.neon.tech"
|
domain: "*.eu-central-1.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "10min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -1,59 +0,0 @@
|
|||||||
# Helm chart values for neon-proxy-link.
|
|
||||||
# This is a YAML-formatted file.
|
|
||||||
|
|
||||||
image:
|
|
||||||
repository: neondatabase/neon
|
|
||||||
|
|
||||||
settings:
|
|
||||||
authBackend: "link"
|
|
||||||
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
|
|
||||||
uri: "https://console.neon.tech/psql_session/"
|
|
||||||
domain: "pg.neon.tech"
|
|
||||||
sentryEnvironment: "production"
|
|
||||||
|
|
||||||
# -- Additional labels for zenith-proxy pods
|
|
||||||
podLabels:
|
|
||||||
zenith_service: proxy
|
|
||||||
zenith_env: production
|
|
||||||
zenith_region: us-east-2
|
|
||||||
zenith_region_slug: us-east-2
|
|
||||||
|
|
||||||
service:
|
|
||||||
type: LoadBalancer
|
|
||||||
annotations:
|
|
||||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
|
||||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
|
||||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
|
|
||||||
external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech
|
|
||||||
|
|
||||||
exposedService:
|
|
||||||
annotations:
|
|
||||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
|
||||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
|
||||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
|
||||||
external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech
|
|
||||||
|
|
||||||
extraManifests:
|
|
||||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
|
||||||
kind: VMServiceScrape
|
|
||||||
metadata:
|
|
||||||
name: "{{ include \"neon-proxy.fullname\" . }}"
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
|
|
||||||
app.kubernetes.io/name: neon-proxy
|
|
||||||
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
|
|
||||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
namespace: "{{ .Release.Namespace }}"
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: "neon-proxy"
|
|
||||||
endpoints:
|
|
||||||
- port: http
|
|
||||||
path: /metrics
|
|
||||||
interval: 10s
|
|
||||||
scrapeTimeout: 10s
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- "{{ .Release.Namespace }}"
|
|
||||||
@@ -10,8 +10,6 @@ settings:
|
|||||||
domain: "*.us-east-2.aws.neon.tech"
|
domain: "*.us-east-2.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "10min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ settings:
|
|||||||
domain: "*.us-west-2.aws.neon.tech"
|
domain: "*.us-west-2.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "10min"
|
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
podLabels:
|
podLabels:
|
||||||
|
|||||||
@@ -4,8 +4,6 @@ settings:
|
|||||||
domain: "*.cloud.neon.tech"
|
domain: "*.cloud.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
|
||||||
metricCollectionInterval: "10min"
|
|
||||||
|
|
||||||
podLabels:
|
podLabels:
|
||||||
zenith_service: proxy-scram
|
zenith_service: proxy-scram
|
||||||
|
|||||||
105
.github/workflows/benchmarking.yml
vendored
105
.github/workflows/benchmarking.yml
vendored
@@ -489,108 +489,3 @@ jobs:
|
|||||||
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
user-examples-compare:
|
|
||||||
if: success() || failure()
|
|
||||||
needs: [ tpch-compare ]
|
|
||||||
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
|
|
||||||
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
|
||||||
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
|
||||||
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
|
|
||||||
|
|
||||||
env:
|
|
||||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
|
||||||
DEFAULT_PG_VERSION: 14
|
|
||||||
TEST_OUTPUT: /tmp/test_output
|
|
||||||
BUILD_TYPE: remote
|
|
||||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
|
||||||
PLATFORM: ${{ matrix.platform }}
|
|
||||||
|
|
||||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
|
||||||
container:
|
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
|
||||||
options: --init
|
|
||||||
|
|
||||||
timeout-minutes: 360 # 6h
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
|
|
||||||
- name: Download Neon artifact
|
|
||||||
uses: ./.github/actions/download
|
|
||||||
with:
|
|
||||||
name: neon-${{ runner.os }}-release-artifact
|
|
||||||
path: /tmp/neon/
|
|
||||||
prefix: latest
|
|
||||||
|
|
||||||
- name: Add Postgres binaries to PATH
|
|
||||||
run: |
|
|
||||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
|
||||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
|
||||||
|
|
||||||
- name: Set up Connection String
|
|
||||||
id: set-up-connstr
|
|
||||||
run: |
|
|
||||||
case "${PLATFORM}" in
|
|
||||||
neon-captest-prefetch)
|
|
||||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
|
|
||||||
;;
|
|
||||||
rds-aurora)
|
|
||||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
|
|
||||||
;;
|
|
||||||
rds-postgres)
|
|
||||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
psql ${CONNSTR} -c "SELECT version();"
|
|
||||||
|
|
||||||
- name: Set database options
|
|
||||||
if: matrix.platform == 'neon-captest-prefetch'
|
|
||||||
run: |
|
|
||||||
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
|
|
||||||
|
|
||||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
|
|
||||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
|
|
||||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
|
|
||||||
env:
|
|
||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
|
||||||
|
|
||||||
- name: Run user examples
|
|
||||||
uses: ./.github/actions/run-python-test-set
|
|
||||||
with:
|
|
||||||
build_type: ${{ env.BUILD_TYPE }}
|
|
||||||
test_selection: performance/test_perf_olap.py
|
|
||||||
run_in_parallel: false
|
|
||||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
|
||||||
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
|
|
||||||
env:
|
|
||||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
|
||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
|
||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
|
||||||
|
|
||||||
- name: Create Allure report
|
|
||||||
if: success() || failure()
|
|
||||||
uses: ./.github/actions/allure-report
|
|
||||||
with:
|
|
||||||
action: generate
|
|
||||||
build_type: ${{ env.BUILD_TYPE }}
|
|
||||||
|
|
||||||
- name: Post to a Slack channel
|
|
||||||
if: ${{ github.event.schedule && failure() }}
|
|
||||||
uses: slackapi/slack-github-action@v1
|
|
||||||
with:
|
|
||||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
|
||||||
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
|
||||||
env:
|
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
|
||||||
|
|||||||
177
.github/workflows/build_and_test.yml
vendored
177
.github/workflows/build_and_test.yml
vendored
@@ -19,12 +19,10 @@ concurrency:
|
|||||||
env:
|
env:
|
||||||
RUST_BACKTRACE: 1
|
RUST_BACKTRACE: 1
|
||||||
COPT: '-Werror'
|
COPT: '-Werror'
|
||||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
|
||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
tag:
|
tag:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||||
outputs:
|
outputs:
|
||||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||||
@@ -52,7 +50,7 @@ jobs:
|
|||||||
id: build-tag
|
id: build-tag
|
||||||
|
|
||||||
check-codestyle-python:
|
check-codestyle-python:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -87,7 +85,7 @@ jobs:
|
|||||||
run: poetry run mypy .
|
run: poetry run mypy .
|
||||||
|
|
||||||
check-codestyle-rust:
|
check-codestyle-rust:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -99,16 +97,16 @@ jobs:
|
|||||||
submodules: true
|
submodules: true
|
||||||
fetch-depth: 1
|
fetch-depth: 1
|
||||||
|
|
||||||
# Disabled for now
|
- name: Restore cargo deps cache
|
||||||
# - name: Restore cargo deps cache
|
id: cache_cargo
|
||||||
# id: cache_cargo
|
uses: actions/cache@v3
|
||||||
# uses: actions/cache@v3
|
with:
|
||||||
# with:
|
path: |
|
||||||
# path: |
|
~/.cargo/registry/
|
||||||
# !~/.cargo/registry/src
|
!~/.cargo/registry/src
|
||||||
# ~/.cargo/git/
|
~/.cargo/git/
|
||||||
# target/
|
target/
|
||||||
# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||||
|
|
||||||
# Some of our rust modules use FFI and need those to be checked
|
# Some of our rust modules use FFI and need those to be checked
|
||||||
- name: Get postgres headers
|
- name: Get postgres headers
|
||||||
@@ -135,7 +133,7 @@ jobs:
|
|||||||
run: cargo deny check
|
run: cargo deny check
|
||||||
|
|
||||||
build-neon:
|
build-neon:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -143,6 +141,7 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
build_type: [ debug, release ]
|
build_type: [ debug, release ]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
BUILD_TYPE: ${{ matrix.build_type }}
|
BUILD_TYPE: ${{ matrix.build_type }}
|
||||||
GIT_VERSION: ${{ github.sha }}
|
GIT_VERSION: ${{ github.sha }}
|
||||||
@@ -195,26 +194,24 @@ jobs:
|
|||||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||||
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
|
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
|
||||||
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
|
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
|
||||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
# Disabled for now
|
|
||||||
# Don't include the ~/.cargo/registry/src directory. It contains just
|
# Don't include the ~/.cargo/registry/src directory. It contains just
|
||||||
# uncompressed versions of the crates in ~/.cargo/registry/cache
|
# uncompressed versions of the crates in ~/.cargo/registry/cache
|
||||||
# directory, and it's faster to let 'cargo' to rebuild it from the
|
# directory, and it's faster to let 'cargo' to rebuild it from the
|
||||||
# compressed crates.
|
# compressed crates.
|
||||||
# - name: Cache cargo deps
|
- name: Cache cargo deps
|
||||||
# id: cache_cargo
|
id: cache_cargo
|
||||||
# uses: actions/cache@v3
|
uses: actions/cache@v3
|
||||||
# with:
|
with:
|
||||||
# path: |
|
path: |
|
||||||
# ~/.cargo/registry/
|
~/.cargo/registry/
|
||||||
# !~/.cargo/registry/src
|
!~/.cargo/registry/src
|
||||||
# ~/.cargo/git/
|
~/.cargo/git/
|
||||||
# target/
|
target/
|
||||||
# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||||
# key: |
|
key: |
|
||||||
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||||
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
||||||
|
|
||||||
- name: Cache postgres v14 build
|
- name: Cache postgres v14 build
|
||||||
id: cache_pg_14
|
id: cache_pg_14
|
||||||
@@ -304,7 +301,7 @@ jobs:
|
|||||||
uses: ./.github/actions/save-coverage-data
|
uses: ./.github/actions/save-coverage-data
|
||||||
|
|
||||||
regress-tests:
|
regress-tests:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -337,7 +334,7 @@ jobs:
|
|||||||
uses: ./.github/actions/save-coverage-data
|
uses: ./.github/actions/save-coverage-data
|
||||||
|
|
||||||
benchmarks:
|
benchmarks:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -368,7 +365,7 @@ jobs:
|
|||||||
# while coverage is currently collected for the debug ones
|
# while coverage is currently collected for the debug ones
|
||||||
|
|
||||||
merge-allure-report:
|
merge-allure-report:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -405,7 +402,7 @@ jobs:
|
|||||||
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
|
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
|
||||||
|
|
||||||
coverage-report:
|
coverage-report:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -421,17 +418,16 @@ jobs:
|
|||||||
submodules: true
|
submodules: true
|
||||||
fetch-depth: 1
|
fetch-depth: 1
|
||||||
|
|
||||||
# Disabled for now
|
- name: Restore cargo deps cache
|
||||||
# - name: Restore cargo deps cache
|
id: cache_cargo
|
||||||
# id: cache_cargo
|
uses: actions/cache@v3
|
||||||
# uses: actions/cache@v3
|
with:
|
||||||
# with:
|
path: |
|
||||||
# path: |
|
~/.cargo/registry/
|
||||||
# ~/.cargo/registry/
|
!~/.cargo/registry/src
|
||||||
# !~/.cargo/registry/src
|
~/.cargo/git/
|
||||||
# ~/.cargo/git/
|
target/
|
||||||
# target/
|
key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||||
# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
|
||||||
|
|
||||||
- name: Get Neon artifact
|
- name: Get Neon artifact
|
||||||
uses: ./.github/actions/download
|
uses: ./.github/actions/download
|
||||||
@@ -481,7 +477,7 @@ jobs:
|
|||||||
}"
|
}"
|
||||||
|
|
||||||
trigger-e2e-tests:
|
trigger-e2e-tests:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||||
options: --init
|
options: --init
|
||||||
@@ -526,10 +522,9 @@ jobs:
|
|||||||
}"
|
}"
|
||||||
|
|
||||||
neon-image:
|
neon-image:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
needs: [ tag ]
|
needs: [ tag ]
|
||||||
# https://github.com/GoogleContainerTools/kaniko/issues/2005
|
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
shell: sh -eu {0}
|
shell: sh -eu {0}
|
||||||
@@ -545,16 +540,12 @@ jobs:
|
|||||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||||
|
|
||||||
- name: Kaniko build neon
|
- name: Kaniko build neon
|
||||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||||
|
|
||||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
|
||||||
- name: Cleanup ECR folder
|
|
||||||
run: rm -rf ~/.ecr
|
|
||||||
|
|
||||||
compute-tools-image:
|
compute-tools-image:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
needs: [ tag ]
|
needs: [ tag ]
|
||||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
shell: sh -eu {0}
|
shell: sh -eu {0}
|
||||||
@@ -567,14 +558,11 @@ jobs:
|
|||||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||||
|
|
||||||
- name: Kaniko build compute tools
|
- name: Kaniko build compute tools
|
||||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||||
|
|
||||||
- name: Cleanup ECR folder
|
|
||||||
run: rm -rf ~/.ecr
|
|
||||||
|
|
||||||
compute-node-image:
|
compute-node-image:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||||
needs: [ tag ]
|
needs: [ tag ]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
@@ -595,13 +583,10 @@ jobs:
|
|||||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||||
|
|
||||||
- name: Kaniko build compute node with extensions
|
- name: Kaniko build compute node with extensions
|
||||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||||
|
|
||||||
- name: Cleanup ECR folder
|
|
||||||
run: rm -rf ~/.ecr
|
|
||||||
|
|
||||||
vm-compute-node-image:
|
vm-compute-node-image:
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
needs: [ tag, compute-node-image ]
|
needs: [ tag, compute-node-image ]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
@@ -610,8 +595,6 @@ jobs:
|
|||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
shell: sh -eu {0}
|
shell: sh -eu {0}
|
||||||
env:
|
|
||||||
VM_INFORMANT_VERSION: 0.1.1
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Downloading latest vm-builder
|
- name: Downloading latest vm-builder
|
||||||
@@ -623,22 +606,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||||
|
|
||||||
- name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
|
|
||||||
run: |
|
|
||||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
|
|
||||||
chmod +x vm-informant
|
|
||||||
|
|
||||||
- name: Adding VM informant to compute-node image
|
|
||||||
run: |
|
|
||||||
ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
|
|
||||||
docker cp vm-informant $ID:/bin/vm-informant
|
|
||||||
docker commit $ID temp-vm-compute-node
|
|
||||||
docker rm -f $ID
|
|
||||||
|
|
||||||
- name: Build vm image
|
- name: Build vm image
|
||||||
run: |
|
run: |
|
||||||
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
|
./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||||
./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
|
||||||
|
|
||||||
- name: Pushing vm-compute-node image
|
- name: Pushing vm-compute-node image
|
||||||
run: |
|
run: |
|
||||||
@@ -646,7 +616,7 @@ jobs:
|
|||||||
|
|
||||||
test-images:
|
test-images:
|
||||||
needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
|
needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -688,7 +658,7 @@ jobs:
|
|||||||
docker compose -f ./docker-compose/docker-compose.yml down
|
docker compose -f ./docker-compose/docker-compose.yml down
|
||||||
|
|
||||||
promote-images:
|
promote-images:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
needs: [ tag, test-images, vm-compute-node-image ]
|
needs: [ tag, test-images, vm-compute-node-image ]
|
||||||
if: github.event_name != 'workflow_dispatch'
|
if: github.event_name != 'workflow_dispatch'
|
||||||
container: amazon/aws-cli
|
container: amazon/aws-cli
|
||||||
@@ -696,8 +666,6 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools]
|
name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools]
|
||||||
env:
|
|
||||||
AWS_DEFAULT_REGION: eu-central-1
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Promote image to latest
|
- name: Promote image to latest
|
||||||
@@ -793,11 +761,8 @@ jobs:
|
|||||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||||
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||||
|
|
||||||
- name: Cleanup ECR folder
|
|
||||||
run: rm -rf ~/.ecr
|
|
||||||
|
|
||||||
calculate-deploy-targets:
|
calculate-deploy-targets:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
if: |
|
if: |
|
||||||
github.ref_name == 'release' &&
|
github.ref_name == 'release' &&
|
||||||
github.event_name != 'workflow_dispatch'
|
github.event_name != 'workflow_dispatch'
|
||||||
@@ -815,7 +780,7 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
deploy:
|
deploy:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||||
@@ -863,7 +828,7 @@ jobs:
|
|||||||
rm -f neon_install.tar.gz .neon_current_version
|
rm -f neon_install.tar.gz .neon_current_version
|
||||||
|
|
||||||
deploy-new:
|
deploy-new:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||||
@@ -903,7 +868,7 @@ jobs:
|
|||||||
rm -f neon_install.tar.gz .neon_current_version
|
rm -f neon_install.tar.gz .neon_current_version
|
||||||
|
|
||||||
deploy-pr-test-new:
|
deploy-pr-test-new:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||||
@@ -978,7 +943,7 @@ jobs:
|
|||||||
rm -f neon_install.tar.gz .neon_current_version
|
rm -f neon_install.tar.gz .neon_current_version
|
||||||
|
|
||||||
deploy-proxy:
|
deploy-proxy:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||||
@@ -1023,7 +988,7 @@ jobs:
|
|||||||
|
|
||||||
deploy-storage-broker:
|
deploy-storage-broker:
|
||||||
name: deploy storage broker on old staging and old prod
|
name: deploy storage broker on old staging and old prod
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||||
@@ -1065,7 +1030,7 @@ jobs:
|
|||||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||||
|
|
||||||
deploy-proxy-new:
|
deploy-proxy-new:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||||
needs: [ push-docker-hub, tag, regress-tests ]
|
needs: [ push-docker-hub, tag, regress-tests ]
|
||||||
@@ -1118,7 +1083,7 @@ jobs:
|
|||||||
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||||
|
|
||||||
deploy-storage-broker-dev-new:
|
deploy-storage-broker-dev-new:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||||
needs: [ push-docker-hub, tag, regress-tests ]
|
needs: [ push-docker-hub, tag, regress-tests ]
|
||||||
@@ -1169,16 +1134,12 @@ jobs:
|
|||||||
include:
|
include:
|
||||||
- target_region: us-east-2
|
- target_region: us-east-2
|
||||||
target_cluster: prod-us-east-2-delta
|
target_cluster: prod-us-east-2-delta
|
||||||
deploy_link_proxy: true
|
|
||||||
- target_region: us-west-2
|
- target_region: us-west-2
|
||||||
target_cluster: prod-us-west-2-eta
|
target_cluster: prod-us-west-2-eta
|
||||||
deploy_link_proxy: false
|
|
||||||
- target_region: eu-central-1
|
- target_region: eu-central-1
|
||||||
target_cluster: prod-eu-central-1-gamma
|
target_cluster: prod-eu-central-1-gamma
|
||||||
deploy_link_proxy: false
|
|
||||||
- target_region: ap-southeast-1
|
- target_region: ap-southeast-1
|
||||||
target_cluster: prod-ap-southeast-1-epsilon
|
target_cluster: prod-ap-southeast-1-epsilon
|
||||||
deploy_link_proxy: false
|
|
||||||
environment:
|
environment:
|
||||||
name: prod-${{ matrix.target_region }}
|
name: prod-${{ matrix.target_region }}
|
||||||
steps:
|
steps:
|
||||||
@@ -1193,17 +1154,11 @@ jobs:
|
|||||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||||
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
|
||||||
|
|
||||||
- name: Re-deploy scram proxy
|
- name: Re-deploy proxy
|
||||||
run: |
|
run: |
|
||||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||||
|
|
||||||
- name: Re-deploy link proxy
|
|
||||||
if: matrix.deploy_link_proxy
|
|
||||||
run: |
|
|
||||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
|
||||||
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
|
||||||
|
|
||||||
deploy-storage-broker-prod-new:
|
deploy-storage-broker-prod-new:
|
||||||
runs-on: prod
|
runs-on: prod
|
||||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||||
@@ -1245,7 +1200,7 @@ jobs:
|
|||||||
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||||
|
|
||||||
promote-compatibility-data:
|
promote-compatibility-data:
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, dev, x64 ]
|
||||||
container:
|
container:
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||||
options: --init
|
options: --init
|
||||||
|
|||||||
823
Cargo.lock
generated
823
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
145
Cargo.toml
145
Cargo.toml
@@ -10,145 +10,6 @@ members = [
|
|||||||
"libs/*",
|
"libs/*",
|
||||||
]
|
]
|
||||||
|
|
||||||
[workspace.package]
|
|
||||||
edition = "2021"
|
|
||||||
license = "Apache-2.0"
|
|
||||||
|
|
||||||
## All dependency versions, used in the project
|
|
||||||
[workspace.dependencies]
|
|
||||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
|
||||||
async-stream = "0.3"
|
|
||||||
async-trait = "0.1"
|
|
||||||
atty = "0.2.14"
|
|
||||||
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
|
|
||||||
aws-sdk-s3 = "0.21.0"
|
|
||||||
aws-smithy-http = "0.51.0"
|
|
||||||
aws-types = "0.51.0"
|
|
||||||
base64 = "0.13.0"
|
|
||||||
bincode = "1.3"
|
|
||||||
bindgen = "0.61"
|
|
||||||
bstr = "1.0"
|
|
||||||
byteorder = "1.4"
|
|
||||||
bytes = "1.0"
|
|
||||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
|
||||||
clap = "4.0"
|
|
||||||
close_fds = "0.3.2"
|
|
||||||
comfy-table = "6.1"
|
|
||||||
const_format = "0.2"
|
|
||||||
crc32c = "0.6"
|
|
||||||
crossbeam-utils = "0.8.5"
|
|
||||||
fail = "0.5.0"
|
|
||||||
fs2 = "0.4.3"
|
|
||||||
futures = "0.3"
|
|
||||||
futures-core = "0.3"
|
|
||||||
futures-util = "0.3"
|
|
||||||
git-version = "0.3"
|
|
||||||
hashbrown = "0.13"
|
|
||||||
hex = "0.4"
|
|
||||||
hex-literal = "0.3"
|
|
||||||
hmac = "0.12.1"
|
|
||||||
hostname = "0.3.1"
|
|
||||||
humantime = "2.1"
|
|
||||||
humantime-serde = "1.1.1"
|
|
||||||
hyper = "0.14"
|
|
||||||
hyper-tungstenite = "0.9"
|
|
||||||
itertools = "0.10"
|
|
||||||
jsonwebtoken = "8"
|
|
||||||
libc = "0.2"
|
|
||||||
md5 = "0.7.0"
|
|
||||||
memoffset = "0.8"
|
|
||||||
nix = "0.26"
|
|
||||||
notify = "5.0.0"
|
|
||||||
num-traits = "0.2.15"
|
|
||||||
once_cell = "1.13"
|
|
||||||
opentelemetry = "0.18.0"
|
|
||||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
|
||||||
opentelemetry-semantic-conventions = "0.10.0"
|
|
||||||
tracing-opentelemetry = "0.18.0"
|
|
||||||
parking_lot = "0.12"
|
|
||||||
pin-project-lite = "0.2"
|
|
||||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
|
||||||
prost = "0.11"
|
|
||||||
rand = "0.8"
|
|
||||||
regex = "1.4"
|
|
||||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
|
||||||
routerify = "3"
|
|
||||||
rpds = "0.12.0"
|
|
||||||
rustls = "0.20"
|
|
||||||
rustls-pemfile = "1"
|
|
||||||
rustls-split = "0.3"
|
|
||||||
scopeguard = "1.1"
|
|
||||||
sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
|
||||||
serde_json = "1"
|
|
||||||
serde_with = "2.0"
|
|
||||||
sha2 = "0.10.2"
|
|
||||||
signal-hook = "0.3"
|
|
||||||
socket2 = "0.4.4"
|
|
||||||
strum = "0.24"
|
|
||||||
strum_macros = "0.24"
|
|
||||||
svg_fmt = "0.4.1"
|
|
||||||
tar = "0.4"
|
|
||||||
thiserror = "1.0"
|
|
||||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
|
||||||
tokio = { version = "1.17", features = ["macros"] }
|
|
||||||
tokio-postgres-rustls = "0.9.0"
|
|
||||||
tokio-rustls = "0.23"
|
|
||||||
tokio-stream = "0.1"
|
|
||||||
tokio-util = { version = "0.7", features = ["io"] }
|
|
||||||
toml = "0.5"
|
|
||||||
toml_edit = { version = "0.17", features = ["easy"] }
|
|
||||||
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
|
|
||||||
tracing = "0.1"
|
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
||||||
url = "2.2"
|
|
||||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
|
||||||
walkdir = "2.3.2"
|
|
||||||
webpki-roots = "0.22.5"
|
|
||||||
x509-parser = "0.14"
|
|
||||||
|
|
||||||
## TODO replace this with tracing
|
|
||||||
env_logger = "0.10"
|
|
||||||
log = "0.4"
|
|
||||||
|
|
||||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
|
||||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
|
||||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
|
||||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
|
||||||
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
|
||||||
|
|
||||||
## Local libraries
|
|
||||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
|
||||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
|
||||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
|
||||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
|
||||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
|
||||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
|
||||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
|
||||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
|
||||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
|
||||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
|
||||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
|
||||||
utils = { version = "0.1", path = "./libs/utils/" }
|
|
||||||
|
|
||||||
## Common library dependency
|
|
||||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
|
||||||
|
|
||||||
## Build dependencies
|
|
||||||
criterion = "0.4"
|
|
||||||
rcgen = "0.10"
|
|
||||||
rstest = "0.16"
|
|
||||||
tempfile = "3.2"
|
|
||||||
tonic-build = "0.8"
|
|
||||||
|
|
||||||
# This is only needed for proxy's tests.
|
|
||||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
|
||||||
[patch.crates-io]
|
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
|
||||||
|
|
||||||
################# Binary contents sections
|
|
||||||
|
|
||||||
[profile.release]
|
[profile.release]
|
||||||
# This is useful for profiling and, to some extent, debug.
|
# This is useful for profiling and, to some extent, debug.
|
||||||
# Besides, debug info should not affect the performance.
|
# Besides, debug info should not affect the performance.
|
||||||
@@ -209,3 +70,9 @@ inherits = "release"
|
|||||||
debug = false # true = 2 = all symbols, 1 = line only
|
debug = false # true = 2 = all symbols, 1 = line only
|
||||||
opt-level = "z"
|
opt-level = "z"
|
||||||
lto = true
|
lto = true
|
||||||
|
|
||||||
|
|
||||||
|
# This is only needed for proxy's tests.
|
||||||
|
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||||
|
[patch.crates-io]
|
||||||
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
#
|
||||||
ARG IMAGE=rust
|
# This file is identical to the Dockerfile.compute-node-v15 file
|
||||||
|
# except for the version of Postgres that is built.
|
||||||
|
#
|
||||||
|
|
||||||
ARG TAG=pinned
|
ARG TAG=pinned
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
@@ -19,8 +22,7 @@ RUN apt update && \
|
|||||||
#
|
#
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
FROM build-deps AS pg-build
|
FROM build-deps AS pg-build
|
||||||
ARG PG_VERSION
|
COPY vendor/postgres-v14 postgres
|
||||||
COPY vendor/postgres-${PG_VERSION} postgres
|
|
||||||
RUN cd postgres && \
|
RUN cd postgres && \
|
||||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
|
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
|
||||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||||
@@ -32,8 +34,7 @@ RUN cd postgres && \
|
|||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
@@ -61,7 +62,8 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
|||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
@@ -133,27 +135,6 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
|||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "unit-pg-build"
|
|
||||||
# compile unit extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS unit-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
|
|
||||||
tar xvzf 7.7.tar.gz && \
|
|
||||||
cd postgresql-unit-7.7 && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
|
||||||
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path.
|
|
||||||
# This one-liner removes pgsql/ part of the path.
|
|
||||||
# NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix.
|
|
||||||
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
# Layer "neon-pg-ext-build"
|
# Layer "neon-pg-ext-build"
|
||||||
@@ -165,7 +146,6 @@ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
|||||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=h3-pg-build /h3/usr /
|
COPY --from=h3-pg-build /h3/usr /
|
||||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY pgxn/ pgxn/
|
COPY pgxn/ pgxn/
|
||||||
|
|
||||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||||
@@ -178,7 +158,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
|||||||
# Compile and run the Neon-specific `compute_ctl` binary
|
# Compile and run the Neon-specific `compute_ctl` binary
|
||||||
#
|
#
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
|
||||||
USER nonroot
|
USER nonroot
|
||||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||||
COPY --chown=nonroot . .
|
COPY --chown=nonroot . .
|
||||||
220
Dockerfile.compute-node-v15
Normal file
220
Dockerfile.compute-node-v15
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
#
|
||||||
|
# This file is identical to the Dockerfile.compute-node-v14 file
|
||||||
|
# except for the version of Postgres that is built.
|
||||||
|
#
|
||||||
|
|
||||||
|
ARG TAG=pinned
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Layer "build-deps"
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM debian:bullseye-slim AS build-deps
|
||||||
|
RUN apt update && \
|
||||||
|
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||||
|
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Layer "pg-build"
|
||||||
|
# Build Postgres from the neon postgres repository.
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM build-deps AS pg-build
|
||||||
|
COPY vendor/postgres-v15 postgres
|
||||||
|
RUN cd postgres && \
|
||||||
|
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
|
||||||
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||||
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||||
|
# Install headers
|
||||||
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||||
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||||
|
# Enable some of contrib extensions
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Layer "postgis-build"
|
||||||
|
# Build PostGIS from the upstream PostGIS mirror.
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM build-deps AS postgis-build
|
||||||
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
RUN apt update && \
|
||||||
|
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
|
||||||
|
|
||||||
|
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||||
|
tar xvzf postgis-3.3.1.tar.gz && \
|
||||||
|
cd postgis-3.3.1 && \
|
||||||
|
./autogen.sh && \
|
||||||
|
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||||
|
./configure && \
|
||||||
|
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
|
cd extensions/postgis && \
|
||||||
|
make clean && \
|
||||||
|
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Layer "plv8-build"
|
||||||
|
# Build plv8
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM build-deps AS plv8-build
|
||||||
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
RUN apt update && \
|
||||||
|
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
|
||||||
|
|
||||||
|
# https://github.com/plv8/plv8/issues/475:
|
||||||
|
# v8 uses gold for linking and sets `--thread-count=4` which breaks
|
||||||
|
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
|
||||||
|
# Install newer gold version manually as debian-testing binutils version updates
|
||||||
|
# libc version, which in turn breaks other extension built against non-testing libc.
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
|
||||||
|
tar xvzf binutils-2.38.tar.gz && \
|
||||||
|
cd binutils-2.38 && \
|
||||||
|
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
|
cd ../bfd && ./configure && make bfdver.h && \
|
||||||
|
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
|
||||||
|
cp /usr/local/bin/ld.gold /usr/bin/gold
|
||||||
|
|
||||||
|
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||||
|
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||||
|
tar xvzf v3.1.4.tar.gz && \
|
||||||
|
cd plv8-3.1.4 && \
|
||||||
|
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||||
|
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||||
|
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
|
rm -rf /plv8-* && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Layer "h3-pg-build"
|
||||||
|
# Build h3_pg
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM build-deps AS h3-pg-build
|
||||||
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
|
||||||
|
# packaged cmake is too old
|
||||||
|
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
|
||||||
|
-q -O /tmp/cmake-install.sh \
|
||||||
|
&& chmod u+x /tmp/cmake-install.sh \
|
||||||
|
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||||
|
&& rm /tmp/cmake-install.sh
|
||||||
|
|
||||||
|
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
|
||||||
|
tar xvzf h3.tgz && \
|
||||||
|
cd h3-4.0.1 && \
|
||||||
|
mkdir build && \
|
||||||
|
cd build && \
|
||||||
|
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||||
|
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
|
DESTDIR=/h3 make install && \
|
||||||
|
cp -R /h3/usr / && \
|
||||||
|
rm -rf build
|
||||||
|
|
||||||
|
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
|
||||||
|
tar xvzf h3-pg.tgz && \
|
||||||
|
cd h3-pg-4.0.1 && \
|
||||||
|
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||||
|
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
|
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Layer "neon-pg-ext-build"
|
||||||
|
# compile neon extensions
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM build-deps AS neon-pg-ext-build
|
||||||
|
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
COPY --from=h3-pg-build /h3/usr /
|
||||||
|
COPY pgxn/ pgxn/
|
||||||
|
|
||||||
|
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||||
|
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||||
|
-C pgxn/neon \
|
||||||
|
-s install
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Compile and run the Neon-specific `compute_ctl` binary
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
|
||||||
|
USER nonroot
|
||||||
|
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||||
|
COPY --chown=nonroot . .
|
||||||
|
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Clean up postgres folder before inclusion
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM neon-pg-ext-build AS postgres-cleanup-layer
|
||||||
|
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
|
||||||
|
|
||||||
|
# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
|
||||||
|
RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
|
||||||
|
|
||||||
|
# Remove headers that we won't need anymore - we've completed installation of all extensions
|
||||||
|
RUN rm -r /usr/local/pgsql/include
|
||||||
|
|
||||||
|
# Remove static postgresql libraries - all compilation is finished, so we
|
||||||
|
# can now remove these files - they must be included in other binaries by now
|
||||||
|
# if they were to be used by other libraries.
|
||||||
|
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||||
|
|
||||||
|
#########################################################################################
|
||||||
|
#
|
||||||
|
# Final layer
|
||||||
|
# Put it all together into the final image
|
||||||
|
#
|
||||||
|
#########################################################################################
|
||||||
|
FROM debian:bullseye-slim
|
||||||
|
# Add user postgres
|
||||||
|
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||||
|
echo "postgres:test_console_pass" | chpasswd && \
|
||||||
|
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||||
|
chown -R postgres:postgres /var/db/postgres && \
|
||||||
|
chmod 0750 /var/db/postgres/compute && \
|
||||||
|
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||||
|
|
||||||
|
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||||
|
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||||
|
|
||||||
|
# Install:
|
||||||
|
# libreadline8 for psql
|
||||||
|
# libossp-uuid16 for extension ossp-uuid
|
||||||
|
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
|
||||||
|
RUN apt update && \
|
||||||
|
apt install --no-install-recommends -y \
|
||||||
|
libreadline8 \
|
||||||
|
libossp-uuid16 \
|
||||||
|
libgeos-c1v5 \
|
||||||
|
libgdal28 \
|
||||||
|
libproj19 \
|
||||||
|
libprotobuf-c1 \
|
||||||
|
gdb && \
|
||||||
|
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
|
USER postgres
|
||||||
|
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||||
@@ -1,28 +1,24 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "compute_tools"
|
name = "compute_tools"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow = "1.0"
|
||||||
chrono.workspace = true
|
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||||
clap.workspace = true
|
clap = "4.0"
|
||||||
futures.workspace = true
|
env_logger = "0.9"
|
||||||
hyper = { workspace = true, features = ["full"] }
|
futures = "0.3.13"
|
||||||
notify.workspace = true
|
hyper = { version = "0.14", features = ["full"] }
|
||||||
opentelemetry.workspace = true
|
log = { version = "0.4", features = ["std", "serde"] }
|
||||||
postgres.workspace = true
|
notify = "5.0.0"
|
||||||
regex.workspace = true
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
serde.workspace = true
|
regex = "1"
|
||||||
serde_json.workspace = true
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
tar.workspace = true
|
serde_json = "1"
|
||||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
tar = "0.4"
|
||||||
tokio-postgres.workspace = true
|
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||||
tracing.workspace = true
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
tracing-opentelemetry.workspace = true
|
url = "2.2.2"
|
||||||
tracing-subscriber.workspace = true
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
tracing-utils.workspace = true
|
|
||||||
url.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|||||||
@@ -19,10 +19,6 @@ Also `compute_ctl` spawns two separate service threads:
|
|||||||
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||||
last activity requests.
|
last activity requests.
|
||||||
|
|
||||||
If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
|
|
||||||
compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
|
|
||||||
downscaling and (eventually) will request immediate upscaling under resource pressure.
|
|
||||||
|
|
||||||
Usage example:
|
Usage example:
|
||||||
```sh
|
```sh
|
||||||
compute_ctl -D /var/db/postgres/compute \
|
compute_ctl -D /var/db/postgres/compute \
|
||||||
|
|||||||
@@ -18,10 +18,6 @@
|
|||||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||||
//! last activity requests.
|
//! last activity requests.
|
||||||
//!
|
//!
|
||||||
//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
|
|
||||||
//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
|
|
||||||
//! downscaling and (eventually) will request immediate upscaling under resource pressure.
|
|
||||||
//!
|
|
||||||
//! Usage example:
|
//! Usage example:
|
||||||
//! ```sh
|
//! ```sh
|
||||||
//! compute_ctl -D /var/db/postgres/compute \
|
//! compute_ctl -D /var/db/postgres/compute \
|
||||||
@@ -40,11 +36,10 @@ use std::{thread, time::Duration};
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use clap::Arg;
|
use clap::Arg;
|
||||||
use tracing::{error, info};
|
use log::{error, info};
|
||||||
|
|
||||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||||
use compute_tools::http::api::launch_http_server;
|
use compute_tools::http::api::launch_http_server;
|
||||||
use compute_tools::informant::spawn_vm_informant_if_present;
|
|
||||||
use compute_tools::logger::*;
|
use compute_tools::logger::*;
|
||||||
use compute_tools::monitor::launch_monitor;
|
use compute_tools::monitor::launch_monitor;
|
||||||
use compute_tools::params::*;
|
use compute_tools::params::*;
|
||||||
@@ -53,7 +48,8 @@ use compute_tools::spec::*;
|
|||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
// TODO: re-use `utils::logging` later
|
||||||
|
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||||
|
|
||||||
let matches = cli().get_matches();
|
let matches = cli().get_matches();
|
||||||
|
|
||||||
@@ -84,29 +80,6 @@ fn main() -> Result<()> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Extract OpenTelemetry context for the startup actions from the spec, and
|
|
||||||
// attach it to the current tracing context.
|
|
||||||
//
|
|
||||||
// This is used to propagate the context for the 'start_compute' operation
|
|
||||||
// from the neon control plane. This allows linking together the wider
|
|
||||||
// 'start_compute' operation that creates the compute container, with the
|
|
||||||
// startup actions here within the container.
|
|
||||||
//
|
|
||||||
// Switch to the startup context here, and exit it once the startup has
|
|
||||||
// completed and Postgres is up and running.
|
|
||||||
//
|
|
||||||
// NOTE: This is supposed to only cover the *startup* actions. Once
|
|
||||||
// postgres is configured and up-and-running, we exit this span. Any other
|
|
||||||
// actions that are performed on incoming HTTP requests, for example, are
|
|
||||||
// performed in separate spans.
|
|
||||||
let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
|
|
||||||
use opentelemetry::propagation::TextMapPropagator;
|
|
||||||
use opentelemetry::sdk::propagation::TraceContextPropagator;
|
|
||||||
Some(TraceContextPropagator::new().extract(carrier).attach())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
let pageserver_connstr = spec
|
let pageserver_connstr = spec
|
||||||
.cluster
|
.cluster
|
||||||
.settings
|
.settings
|
||||||
@@ -141,55 +114,30 @@ fn main() -> Result<()> {
|
|||||||
// requests, while configuration is still in progress.
|
// requests, while configuration is still in progress.
|
||||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||||
// Also spawn the thread responsible for handling the VM informant -- if it's present
|
|
||||||
let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
|
|
||||||
|
|
||||||
// Start Postgres
|
// Run compute (Postgres) and hang waiting on it.
|
||||||
let mut delay_exit = false;
|
match compute.prepare_and_run() {
|
||||||
let mut exit_code = None;
|
Ok(ec) => {
|
||||||
let pg = match compute.start_compute() {
|
let code = ec.code().unwrap_or(1);
|
||||||
Ok(pg) => Some(pg),
|
info!("Postgres exited with code {}, shutting down", code);
|
||||||
Err(err) => {
|
exit(code)
|
||||||
error!("could not start the compute node: {:?}", err);
|
}
|
||||||
|
Err(error) => {
|
||||||
|
error!("could not start the compute node: {:?}", error);
|
||||||
|
|
||||||
let mut state = compute.state.write().unwrap();
|
let mut state = compute.state.write().unwrap();
|
||||||
state.error = Some(format!("{:?}", err));
|
state.error = Some(format!("{:?}", error));
|
||||||
state.status = ComputeStatus::Failed;
|
state.status = ComputeStatus::Failed;
|
||||||
drop(state);
|
drop(state);
|
||||||
delay_exit = true;
|
|
||||||
None
|
// Keep serving HTTP requests, so the cloud control plane was able to
|
||||||
|
// get the actual error.
|
||||||
|
info!("giving control plane 30s to collect the error before shutdown");
|
||||||
|
thread::sleep(Duration::from_secs(30));
|
||||||
|
info!("shutting down");
|
||||||
|
Err(error)
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
|
||||||
// propagate to Postgres and it will be shut down as well.
|
|
||||||
if let Some(mut pg) = pg {
|
|
||||||
// Startup is finished, exit the startup tracing span
|
|
||||||
drop(startup_context_guard);
|
|
||||||
|
|
||||||
let ecode = pg
|
|
||||||
.wait()
|
|
||||||
.expect("failed to start waiting on Postgres process");
|
|
||||||
info!("Postgres exited with code {}, shutting down", ecode);
|
|
||||||
exit_code = ecode.code()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Err(err) = compute.check_for_core_dumps() {
|
|
||||||
error!("error while checking for core dumps: {err:?}");
|
|
||||||
}
|
|
||||||
|
|
||||||
// If launch failed, keep serving HTTP requests for a while, so the cloud
|
|
||||||
// control plane can get the actual error.
|
|
||||||
if delay_exit {
|
|
||||||
info!("giving control plane 30s to collect the error before shutdown");
|
|
||||||
thread::sleep(Duration::from_secs(30));
|
|
||||||
info!("shutting down");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
|
||||||
// pending traces before we exit.
|
|
||||||
tracing_utils::shutdown_tracing();
|
|
||||||
|
|
||||||
exit(exit_code.unwrap_or(1))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cli() -> clap::Command {
|
fn cli() -> clap::Command {
|
||||||
|
|||||||
@@ -1,11 +1,10 @@
|
|||||||
use anyhow::{anyhow, Result};
|
use anyhow::{anyhow, Result};
|
||||||
|
use log::error;
|
||||||
use postgres::Client;
|
use postgres::Client;
|
||||||
use tokio_postgres::NoTls;
|
use tokio_postgres::NoTls;
|
||||||
use tracing::{error, instrument};
|
|
||||||
|
|
||||||
use crate::compute::ComputeNode;
|
use crate::compute::ComputeNode;
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
|
||||||
pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
|
pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
|
||||||
let query = "
|
let query = "
|
||||||
CREATE TABLE IF NOT EXISTS health_check (
|
CREATE TABLE IF NOT EXISTS health_check (
|
||||||
@@ -22,7 +21,6 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
|
||||||
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
||||||
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
|
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
|
||||||
if client.is_closed() {
|
if client.is_closed() {
|
||||||
|
|||||||
@@ -17,15 +17,15 @@
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::os::unix::fs::PermissionsExt;
|
use std::os::unix::fs::PermissionsExt;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::process::{Command, Stdio};
|
use std::process::{Command, ExitStatus, Stdio};
|
||||||
use std::sync::atomic::{AtomicU64, Ordering};
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
|
use log::{info, warn};
|
||||||
use postgres::{Client, NoTls};
|
use postgres::{Client, NoTls};
|
||||||
use serde::{Serialize, Serializer};
|
use serde::{Serialize, Serializer};
|
||||||
use tracing::{info, instrument, warn};
|
|
||||||
|
|
||||||
use crate::checker::create_writability_check_data;
|
use crate::checker::create_writability_check_data;
|
||||||
use crate::config;
|
use crate::config;
|
||||||
@@ -121,7 +121,6 @@ impl ComputeNode {
|
|||||||
|
|
||||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||||
#[instrument(skip(self))]
|
|
||||||
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
||||||
let start_time = Utc::now();
|
let start_time = Utc::now();
|
||||||
|
|
||||||
@@ -155,7 +154,6 @@ impl ComputeNode {
|
|||||||
|
|
||||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||||
// and return the reported LSN back to the caller.
|
// and return the reported LSN back to the caller.
|
||||||
#[instrument(skip(self))]
|
|
||||||
fn sync_safekeepers(&self) -> Result<String> {
|
fn sync_safekeepers(&self) -> Result<String> {
|
||||||
let start_time = Utc::now();
|
let start_time = Utc::now();
|
||||||
|
|
||||||
@@ -198,7 +196,6 @@ impl ComputeNode {
|
|||||||
|
|
||||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||||
/// safekeepers sync, basebackup, etc.
|
/// safekeepers sync, basebackup, etc.
|
||||||
#[instrument(skip(self))]
|
|
||||||
pub fn prepare_pgdata(&self) -> Result<()> {
|
pub fn prepare_pgdata(&self) -> Result<()> {
|
||||||
let spec = &self.spec;
|
let spec = &self.spec;
|
||||||
let pgdata_path = Path::new(&self.pgdata);
|
let pgdata_path = Path::new(&self.pgdata);
|
||||||
@@ -232,8 +229,9 @@ impl ComputeNode {
|
|||||||
|
|
||||||
/// Start Postgres as a child process and manage DBs/roles.
|
/// Start Postgres as a child process and manage DBs/roles.
|
||||||
/// After that this will hang waiting on the postmaster process to exit.
|
/// After that this will hang waiting on the postmaster process to exit.
|
||||||
#[instrument(skip(self))]
|
pub fn run(&self) -> Result<ExitStatus> {
|
||||||
pub fn start_postgres(&self) -> Result<std::process::Child> {
|
let start_time = Utc::now();
|
||||||
|
|
||||||
let pgdata_path = Path::new(&self.pgdata);
|
let pgdata_path = Path::new(&self.pgdata);
|
||||||
|
|
||||||
// Run postgres as a child process.
|
// Run postgres as a child process.
|
||||||
@@ -244,15 +242,10 @@ impl ComputeNode {
|
|||||||
|
|
||||||
wait_for_postgres(&mut pg, pgdata_path)?;
|
wait_for_postgres(&mut pg, pgdata_path)?;
|
||||||
|
|
||||||
Ok(pg)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(skip(self))]
|
|
||||||
pub fn apply_config(&self) -> Result<()> {
|
|
||||||
// If connection fails,
|
// If connection fails,
|
||||||
// it may be the old node with `zenith_admin` superuser.
|
// it may be the old node with `zenith_admin` superuser.
|
||||||
//
|
//
|
||||||
// In this case we need to connect with old `zenith_admin` name
|
// In this case we need to connect with old `zenith_admin`name
|
||||||
// and create new user. We cannot simply rename connected user,
|
// and create new user. We cannot simply rename connected user,
|
||||||
// but we can create a new one and grant it all privileges.
|
// but we can create a new one and grant it all privileges.
|
||||||
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
||||||
@@ -278,7 +271,6 @@ impl ComputeNode {
|
|||||||
Ok(client) => client,
|
Ok(client) => client,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
|
||||||
handle_roles(&self.spec, &mut client)?;
|
handle_roles(&self.spec, &mut client)?;
|
||||||
handle_databases(&self.spec, &mut client)?;
|
handle_databases(&self.spec, &mut client)?;
|
||||||
handle_role_deletions(self, &mut client)?;
|
handle_role_deletions(self, &mut client)?;
|
||||||
@@ -287,34 +279,8 @@ impl ComputeNode {
|
|||||||
|
|
||||||
// 'Close' connection
|
// 'Close' connection
|
||||||
drop(client);
|
drop(client);
|
||||||
|
|
||||||
info!(
|
|
||||||
"finished configuration of compute for project {}",
|
|
||||||
self.spec.cluster.cluster_id
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(skip(self))]
|
|
||||||
pub fn start_compute(&self) -> Result<std::process::Child> {
|
|
||||||
info!(
|
|
||||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
|
||||||
self.spec.cluster.cluster_id,
|
|
||||||
self.spec.operation_uuid.as_ref().unwrap(),
|
|
||||||
self.tenant,
|
|
||||||
self.timeline,
|
|
||||||
);
|
|
||||||
|
|
||||||
self.prepare_pgdata()?;
|
|
||||||
|
|
||||||
let start_time = Utc::now();
|
|
||||||
|
|
||||||
let pg = self.start_postgres()?;
|
|
||||||
|
|
||||||
self.apply_config()?;
|
|
||||||
|
|
||||||
let startup_end_time = Utc::now();
|
let startup_end_time = Utc::now();
|
||||||
|
|
||||||
self.metrics.config_ms.store(
|
self.metrics.config_ms.store(
|
||||||
startup_end_time
|
startup_end_time
|
||||||
.signed_duration_since(start_time)
|
.signed_duration_since(start_time)
|
||||||
@@ -334,7 +300,34 @@ impl ComputeNode {
|
|||||||
|
|
||||||
self.set_status(ComputeStatus::Running);
|
self.set_status(ComputeStatus::Running);
|
||||||
|
|
||||||
Ok(pg)
|
info!(
|
||||||
|
"finished configuration of compute for project {}",
|
||||||
|
self.spec.cluster.cluster_id
|
||||||
|
);
|
||||||
|
|
||||||
|
// Wait for child Postgres process basically forever. In this state Ctrl+C
|
||||||
|
// will propagate to Postgres and it will be shut down as well.
|
||||||
|
let ecode = pg
|
||||||
|
.wait()
|
||||||
|
.expect("failed to start waiting on Postgres process");
|
||||||
|
|
||||||
|
self.check_for_core_dumps()
|
||||||
|
.expect("failed to check for core dumps");
|
||||||
|
|
||||||
|
Ok(ecode)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn prepare_and_run(&self) -> Result<ExitStatus> {
|
||||||
|
info!(
|
||||||
|
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||||
|
self.spec.cluster.cluster_id,
|
||||||
|
self.spec.operation_uuid.as_ref().unwrap(),
|
||||||
|
self.tenant,
|
||||||
|
self.timeline,
|
||||||
|
);
|
||||||
|
|
||||||
|
self.prepare_pgdata()?;
|
||||||
|
self.run()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Look for core dumps and collect backtraces.
|
// Look for core dumps and collect backtraces.
|
||||||
@@ -347,7 +340,7 @@ impl ComputeNode {
|
|||||||
//
|
//
|
||||||
// Use that as a default location and pattern, except macos where core dumps are written
|
// Use that as a default location and pattern, except macos where core dumps are written
|
||||||
// to /cores/ directory by default.
|
// to /cores/ directory by default.
|
||||||
pub fn check_for_core_dumps(&self) -> Result<()> {
|
fn check_for_core_dumps(&self) -> Result<()> {
|
||||||
let core_dump_dir = match std::env::consts::OS {
|
let core_dump_dir = match std::env::consts::OS {
|
||||||
"macos" => Path::new("/cores/"),
|
"macos" => Path::new("/cores/"),
|
||||||
_ => Path::new(&self.pgdata),
|
_ => Path::new(&self.pgdata),
|
||||||
|
|||||||
@@ -6,19 +6,13 @@ use std::thread;
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use hyper::service::{make_service_fn, service_fn};
|
use hyper::service::{make_service_fn, service_fn};
|
||||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||||
|
use log::{error, info};
|
||||||
use serde_json;
|
use serde_json;
|
||||||
use tracing::{error, info};
|
|
||||||
use tracing_utils::http::OtelName;
|
|
||||||
|
|
||||||
use crate::compute::ComputeNode;
|
use crate::compute::ComputeNode;
|
||||||
|
|
||||||
// Service function to handle all available routes.
|
// Service function to handle all available routes.
|
||||||
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
|
||||||
//
|
|
||||||
// NOTE: The URI path is currently included in traces. That's OK because
|
|
||||||
// it doesn't contain any variable parts or sensitive information. But
|
|
||||||
// please keep that in mind if you change the routing here.
|
|
||||||
//
|
|
||||||
match (req.method(), req.uri().path()) {
|
match (req.method(), req.uri().path()) {
|
||||||
// Serialized compute state.
|
// Serialized compute state.
|
||||||
(&Method::GET, "/status") => {
|
(&Method::GET, "/status") => {
|
||||||
@@ -36,7 +30,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
|||||||
|
|
||||||
(&Method::POST, "/check_writability") => {
|
(&Method::POST, "/check_writability") => {
|
||||||
info!("serving /check_writability POST request");
|
info!("serving /check_writability POST request");
|
||||||
let res = crate::checker::check_writability(compute).await;
|
let res = crate::checker::check_writability(&compute).await;
|
||||||
match res {
|
match res {
|
||||||
Ok(_) => Response::new(Body::from("true")),
|
Ok(_) => Response::new(Body::from("true")),
|
||||||
Err(e) => Response::new(Body::from(e.to_string())),
|
Err(e) => Response::new(Body::from(e.to_string())),
|
||||||
@@ -62,19 +56,7 @@ async fn serve(state: Arc<ComputeNode>) {
|
|||||||
async move {
|
async move {
|
||||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||||
let state = state.clone();
|
let state = state.clone();
|
||||||
async move {
|
async move { Ok::<_, Infallible>(routes(req, state).await) }
|
||||||
Ok::<_, Infallible>(
|
|
||||||
// NOTE: We include the URI path in the string. It
|
|
||||||
// doesn't contain any variable parts or sensitive
|
|
||||||
// information in this API.
|
|
||||||
tracing_utils::http::tracing_handler(
|
|
||||||
req,
|
|
||||||
|req| routes(req, &state),
|
|
||||||
OtelName::UriPath,
|
|
||||||
)
|
|
||||||
.await,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,50 +0,0 @@
|
|||||||
use std::path::Path;
|
|
||||||
use std::process;
|
|
||||||
use std::thread;
|
|
||||||
use std::time::Duration;
|
|
||||||
use tracing::{info, warn};
|
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
|
||||||
|
|
||||||
const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
|
|
||||||
const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
|
|
||||||
|
|
||||||
/// Launch a thread to start the VM informant if it's present (and restart, on failure)
|
|
||||||
pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
|
|
||||||
let exists = Path::new(VM_INFORMANT_PATH)
|
|
||||||
.try_exists()
|
|
||||||
.context("could not check if path exists")?;
|
|
||||||
|
|
||||||
if !exists {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(Some(
|
|
||||||
thread::Builder::new()
|
|
||||||
.name("run-vm-informant".into())
|
|
||||||
.spawn(move || run_informant())?,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn run_informant() -> ! {
|
|
||||||
let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
|
|
||||||
|
|
||||||
info!("starting VM informant");
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let mut cmd = process::Command::new(VM_INFORMANT_PATH);
|
|
||||||
// Block on subprocess:
|
|
||||||
let result = cmd.status();
|
|
||||||
|
|
||||||
match result {
|
|
||||||
Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
|
|
||||||
Ok(status) if !status.success() => {
|
|
||||||
warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
|
|
||||||
}
|
|
||||||
Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait before retrying
|
|
||||||
thread::sleep(restart_wait);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -8,7 +8,6 @@ pub mod http;
|
|||||||
#[macro_use]
|
#[macro_use]
|
||||||
pub mod logger;
|
pub mod logger;
|
||||||
pub mod compute;
|
pub mod compute;
|
||||||
pub mod informant;
|
|
||||||
pub mod monitor;
|
pub mod monitor;
|
||||||
pub mod params;
|
pub mod params;
|
||||||
pub mod pg_helpers;
|
pub mod pg_helpers;
|
||||||
|
|||||||
@@ -1,37 +1,43 @@
|
|||||||
use tracing_opentelemetry::OpenTelemetryLayer;
|
use std::io::Write;
|
||||||
use tracing_subscriber::layer::SubscriberExt;
|
|
||||||
use tracing_subscriber::prelude::*;
|
|
||||||
|
|
||||||
/// Initialize logging to stderr, and OpenTelemetry tracing and exporter.
|
use anyhow::Result;
|
||||||
///
|
use chrono::Utc;
|
||||||
/// Logging is configured using either `default_log_level` or
|
use env_logger::{Builder, Env};
|
||||||
|
|
||||||
|
macro_rules! info_println {
|
||||||
|
($($tts:tt)*) => {
|
||||||
|
if log_enabled!(Level::Info) {
|
||||||
|
println!($($tts)*);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! info_print {
|
||||||
|
($($tts:tt)*) => {
|
||||||
|
if log_enabled!(Level::Info) {
|
||||||
|
print!($($tts)*);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialize `env_logger` using either `default_level` or
|
||||||
/// `RUST_LOG` environment variable as default log level.
|
/// `RUST_LOG` environment variable as default log level.
|
||||||
///
|
pub fn init_logger(default_level: &str) -> Result<()> {
|
||||||
/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
|
let env = Env::default().filter_or("RUST_LOG", default_level);
|
||||||
/// configuration from environment variables. For example, to change the destination,
|
|
||||||
/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
|
|
||||||
/// `tracing-utils` package description.
|
|
||||||
///
|
|
||||||
pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
|
||||||
// Initialize Logging
|
|
||||||
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
|
||||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
|
|
||||||
|
|
||||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
Builder::from_env(env)
|
||||||
.with_target(false)
|
.format(|buf, record| {
|
||||||
.with_writer(std::io::stderr);
|
let thread_handle = std::thread::current();
|
||||||
|
writeln!(
|
||||||
// Initialize OpenTelemetry
|
buf,
|
||||||
let otlp_layer =
|
"{} [{}] {}: {}",
|
||||||
tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
|
Utc::now().format("%Y-%m-%d %H:%M:%S%.3f %Z"),
|
||||||
|
thread_handle.name().unwrap_or("main"),
|
||||||
// Put it all together
|
record.level(),
|
||||||
tracing_subscriber::registry()
|
record.args()
|
||||||
.with(env_filter)
|
)
|
||||||
.with(otlp_layer)
|
})
|
||||||
.with(fmt_layer)
|
|
||||||
.init();
|
.init();
|
||||||
tracing::info!("logging and tracing started");
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ use std::{thread, time};
|
|||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
|
use log::{debug, info};
|
||||||
use postgres::{Client, NoTls};
|
use postgres::{Client, NoTls};
|
||||||
use tracing::{debug, info};
|
|
||||||
|
|
||||||
use crate::compute::ComputeNode;
|
use crate::compute::ComputeNode;
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,3 @@
|
|||||||
pub const DEFAULT_LOG_LEVEL: &str = "info";
|
pub const DEFAULT_LOG_LEVEL: &str = "info";
|
||||||
// From Postgres docs:
|
pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres";
|
||||||
// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified
|
|
||||||
// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM
|
|
||||||
// (see below), then SCRAM-based authentication will automatically be chosen instead.
|
|
||||||
// https://www.postgresql.org/docs/15/auth-password.html
|
|
||||||
//
|
|
||||||
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
|
|
||||||
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
|
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ use anyhow::{bail, Result};
|
|||||||
use notify::{RecursiveMode, Watcher};
|
use notify::{RecursiveMode, Watcher};
|
||||||
use postgres::{Client, Transaction};
|
use postgres::{Client, Transaction};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use tracing::{debug, instrument};
|
|
||||||
|
|
||||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||||
|
|
||||||
@@ -130,8 +129,8 @@ impl Role {
|
|||||||
/// Serialize a list of role parameters into a Postgres-acceptable
|
/// Serialize a list of role parameters into a Postgres-acceptable
|
||||||
/// string of arguments.
|
/// string of arguments.
|
||||||
pub fn to_pg_options(&self) -> String {
|
pub fn to_pg_options(&self) -> String {
|
||||||
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
|
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails.
|
||||||
// For now, we do not use generic `options` for roles. Once used, add
|
// For now we do not use generic `options` for roles. Once used, add
|
||||||
// `self.options.as_pg_options()` somewhere here.
|
// `self.options.as_pg_options()` somewhere here.
|
||||||
let mut params: String = "LOGIN".to_string();
|
let mut params: String = "LOGIN".to_string();
|
||||||
|
|
||||||
@@ -230,7 +229,6 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
|||||||
/// Wait for Postgres to become ready to accept connections. It's ready to
|
/// Wait for Postgres to become ready to accept connections. It's ready to
|
||||||
/// accept connections when the state-field in `pgdata/postmaster.pid` says
|
/// accept connections when the state-field in `pgdata/postmaster.pid` says
|
||||||
/// 'ready'.
|
/// 'ready'.
|
||||||
#[instrument(skip(pg))]
|
|
||||||
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
||||||
let pid_path = pgdata.join("postmaster.pid");
|
let pid_path = pgdata.join("postmaster.pid");
|
||||||
|
|
||||||
@@ -289,18 +287,18 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let res = rx.recv_timeout(Duration::from_millis(100));
|
let res = rx.recv_timeout(Duration::from_millis(100));
|
||||||
debug!("woken up by notify: {res:?}");
|
log::debug!("woken up by notify: {res:?}");
|
||||||
// If there are multiple events in the channel already, we only need to be
|
// If there are multiple events in the channel already, we only need to be
|
||||||
// check once. Swallow the extra events before we go ahead to check the
|
// check once. Swallow the extra events before we go ahead to check the
|
||||||
// pid file.
|
// pid file.
|
||||||
while let Ok(res) = rx.try_recv() {
|
while let Ok(res) = rx.try_recv() {
|
||||||
debug!("swallowing extra event: {res:?}");
|
log::debug!("swallowing extra event: {res:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that we can open pid file first.
|
// Check that we can open pid file first.
|
||||||
if let Ok(file) = File::open(&pid_path) {
|
if let Ok(file) = File::open(&pid_path) {
|
||||||
if !postmaster_pid_seen {
|
if !postmaster_pid_seen {
|
||||||
debug!("postmaster.pid appeared");
|
log::debug!("postmaster.pid appeared");
|
||||||
watcher
|
watcher
|
||||||
.unwatch(pgdata)
|
.unwatch(pgdata)
|
||||||
.expect("Failed to remove pgdata dir watch");
|
.expect("Failed to remove pgdata dir watch");
|
||||||
@@ -316,7 +314,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
|||||||
// Pid file could be there and we could read it, but it could be empty, for example.
|
// Pid file could be there and we could read it, but it could be empty, for example.
|
||||||
if let Some(Ok(line)) = last_line {
|
if let Some(Ok(line)) = last_line {
|
||||||
let status = line.trim();
|
let status = line.trim();
|
||||||
debug!("last line of postmaster.pid: {status:?}");
|
log::debug!("last line of postmaster.pid: {status:?}");
|
||||||
|
|
||||||
// Now Postgres is ready to accept connections
|
// Now Postgres is ready to accept connections
|
||||||
if status == "ready" {
|
if status == "ready" {
|
||||||
@@ -332,7 +330,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tracing::info!("PostgreSQL is now running, continuing to configure it");
|
log::info!("PostgreSQL is now running, continuing to configure it");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use log::{info, log_enabled, warn, Level};
|
||||||
use postgres::config::Config;
|
use postgres::config::Config;
|
||||||
use postgres::{Client, NoTls};
|
use postgres::{Client, NoTls};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use tracing::{info, info_span, instrument, span_enabled, warn, Level};
|
|
||||||
|
|
||||||
use crate::compute::ComputeNode;
|
use crate::compute::ComputeNode;
|
||||||
use crate::config;
|
use crate::config;
|
||||||
@@ -23,8 +23,6 @@ pub struct ComputeSpec {
|
|||||||
/// Expected cluster state at the end of transition process.
|
/// Expected cluster state at the end of transition process.
|
||||||
pub cluster: Cluster,
|
pub cluster: Cluster,
|
||||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||||
|
|
||||||
pub startup_tracing_context: Option<HashMap<String, String>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Cluster state seen from the perspective of the external tools
|
/// Cluster state seen from the perspective of the external tools
|
||||||
@@ -82,25 +80,23 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
|||||||
|
|
||||||
/// Given a cluster spec json and open transaction it handles roles creation,
|
/// Given a cluster spec json and open transaction it handles roles creation,
|
||||||
/// deletion and update.
|
/// deletion and update.
|
||||||
#[instrument(skip_all)]
|
|
||||||
pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||||
let mut xact = client.transaction()?;
|
let mut xact = client.transaction()?;
|
||||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
||||||
|
|
||||||
// Print a list of existing Postgres roles (only in debug mode)
|
// Print a list of existing Postgres roles (only in debug mode)
|
||||||
if span_enabled!(Level::INFO) {
|
info!("postgres roles:");
|
||||||
info!("postgres roles:");
|
for r in &existing_roles {
|
||||||
for r in &existing_roles {
|
info_println!(
|
||||||
info!(
|
"{} - {}:{}",
|
||||||
" - {}:{}",
|
" ".repeat(27 + 5),
|
||||||
r.name,
|
r.name,
|
||||||
if r.encrypted_password.is_some() {
|
if r.encrypted_password.is_some() {
|
||||||
"[FILTERED]"
|
"[FILTERED]"
|
||||||
} else {
|
} else {
|
||||||
"(null)"
|
"(null)"
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process delta operations first
|
// Process delta operations first
|
||||||
@@ -141,80 +137,58 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
info!("cluster spec roles:");
|
info!("cluster spec roles:");
|
||||||
for role in &spec.cluster.roles {
|
for role in &spec.cluster.roles {
|
||||||
let name = &role.name;
|
let name = &role.name;
|
||||||
// XXX: with a limited number of roles it is fine, but consider making it a HashMap
|
|
||||||
let pg_role = existing_roles.iter().find(|r| r.name == *name);
|
|
||||||
|
|
||||||
enum RoleAction {
|
info_print!(
|
||||||
None,
|
"{} - {}:{}",
|
||||||
Update,
|
" ".repeat(27 + 5),
|
||||||
Create,
|
name,
|
||||||
}
|
if role.encrypted_password.is_some() {
|
||||||
let action = if let Some(r) = pg_role {
|
|
||||||
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|
|
||||||
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|
|
||||||
{
|
|
||||||
RoleAction::Update
|
|
||||||
} else if let Some(pg_pwd) = &r.encrypted_password {
|
|
||||||
// Check whether password changed or not (trim 'md5' prefix first if any)
|
|
||||||
//
|
|
||||||
// This is a backward compatibility hack, which comes from the times when we were using
|
|
||||||
// md5 for everyone and hashes were stored in the console db without md5 prefix. So when
|
|
||||||
// role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix,
|
|
||||||
// but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix.
|
|
||||||
// Here is the only place so far where we compare hashes, so it seems to be the best candidate
|
|
||||||
// to place this compatibility layer.
|
|
||||||
let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") {
|
|
||||||
stripped
|
|
||||||
} else {
|
|
||||||
pg_pwd
|
|
||||||
};
|
|
||||||
if pg_pwd != *role.encrypted_password.as_ref().unwrap() {
|
|
||||||
RoleAction::Update
|
|
||||||
} else {
|
|
||||||
RoleAction::None
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
RoleAction::None
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
RoleAction::Create
|
|
||||||
};
|
|
||||||
|
|
||||||
match action {
|
|
||||||
RoleAction::None => {}
|
|
||||||
RoleAction::Update => {
|
|
||||||
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
|
|
||||||
query.push_str(&role.to_pg_options());
|
|
||||||
xact.execute(query.as_str(), &[])?;
|
|
||||||
}
|
|
||||||
RoleAction::Create => {
|
|
||||||
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
|
|
||||||
info!("role create query: '{}'", &query);
|
|
||||||
query.push_str(&role.to_pg_options());
|
|
||||||
xact.execute(query.as_str(), &[])?;
|
|
||||||
|
|
||||||
let grant_query = format!(
|
|
||||||
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
|
||||||
name.pg_quote()
|
|
||||||
);
|
|
||||||
xact.execute(grant_query.as_str(), &[])?;
|
|
||||||
info!("role grant query: '{}'", &grant_query);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if span_enabled!(Level::INFO) {
|
|
||||||
let pwd = if role.encrypted_password.is_some() {
|
|
||||||
"[FILTERED]"
|
"[FILTERED]"
|
||||||
} else {
|
} else {
|
||||||
"(null)"
|
"(null)"
|
||||||
};
|
}
|
||||||
let action_str = match action {
|
);
|
||||||
RoleAction::None => "",
|
|
||||||
RoleAction::Create => " -> create",
|
// XXX: with a limited number of roles it is fine, but consider making it a HashMap
|
||||||
RoleAction::Update => " -> update",
|
let pg_role = existing_roles.iter().find(|r| r.name == *name);
|
||||||
};
|
|
||||||
info!(" - {}:{}{}", name, pwd, action_str);
|
if let Some(r) = pg_role {
|
||||||
|
let mut update_role = false;
|
||||||
|
|
||||||
|
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|
||||||
|
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|
||||||
|
{
|
||||||
|
update_role = true;
|
||||||
|
} else if let Some(pg_pwd) = &r.encrypted_password {
|
||||||
|
// Check whether password changed or not (trim 'md5:' prefix first)
|
||||||
|
update_role = pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
if update_role {
|
||||||
|
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
|
||||||
|
info_print!(" -> update");
|
||||||
|
|
||||||
|
query.push_str(&role.to_pg_options());
|
||||||
|
xact.execute(query.as_str(), &[])?;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
info!("role name: '{}'", &name);
|
||||||
|
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
|
||||||
|
info!("role create query: '{}'", &query);
|
||||||
|
info_print!(" -> create");
|
||||||
|
|
||||||
|
query.push_str(&role.to_pg_options());
|
||||||
|
xact.execute(query.as_str(), &[])?;
|
||||||
|
|
||||||
|
let grant_query = format!(
|
||||||
|
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
||||||
|
name.pg_quote()
|
||||||
|
);
|
||||||
|
xact.execute(grant_query.as_str(), &[])?;
|
||||||
|
info!("role grant query: '{}'", &grant_query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
info_print!("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
xact.commit()?;
|
xact.commit()?;
|
||||||
@@ -223,25 +197,12 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Reassign all dependent objects and delete requested roles.
|
/// Reassign all dependent objects and delete requested roles.
|
||||||
#[instrument(skip_all)]
|
|
||||||
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||||
if let Some(ops) = &node.spec.delta_operations {
|
if let Some(ops) = &node.spec.delta_operations {
|
||||||
// First, reassign all dependent objects to db owners.
|
// First, reassign all dependent objects to db owners.
|
||||||
info!("reassigning dependent objects of to-be-deleted roles");
|
info!("reassigning dependent objects of to-be-deleted roles");
|
||||||
|
|
||||||
// Fetch existing roles. We could've exported and used `existing_roles` from
|
|
||||||
// `handle_roles()`, but we only make this list there before creating new roles.
|
|
||||||
// Which is probably fine as we never create to-be-deleted roles, but that'd
|
|
||||||
// just look a bit untidy. Anyway, the entire `pg_roles` should be in shared
|
|
||||||
// buffers already, so this shouldn't be a big deal.
|
|
||||||
let mut xact = client.transaction()?;
|
|
||||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
|
||||||
xact.commit()?;
|
|
||||||
|
|
||||||
for op in ops {
|
for op in ops {
|
||||||
// Check that role is still present in Postgres, as this could be a
|
if op.action == "delete_role" {
|
||||||
// restart with the same spec after role deletion.
|
|
||||||
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
|
|
||||||
reassign_owned_objects(node, &op.name)?;
|
reassign_owned_objects(node, &op.name)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -300,16 +261,13 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()>
|
|||||||
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
||||||
/// atomicity should be enough here due to the order of operations and various checks,
|
/// atomicity should be enough here due to the order of operations and various checks,
|
||||||
/// which together provide us idempotency.
|
/// which together provide us idempotency.
|
||||||
#[instrument(skip_all)]
|
|
||||||
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||||
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
|
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
|
||||||
|
|
||||||
// Print a list of existing Postgres databases (only in debug mode)
|
// Print a list of existing Postgres databases (only in debug mode)
|
||||||
if span_enabled!(Level::INFO) {
|
info!("postgres databases:");
|
||||||
info!("postgres databases:");
|
for r in &existing_dbs {
|
||||||
for r in &existing_dbs {
|
info_println!("{} - {}:{}", " ".repeat(27 + 5), r.name, r.owner);
|
||||||
info!(" {}:{}", r.name, r.owner);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process delta operations first
|
// Process delta operations first
|
||||||
@@ -352,15 +310,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
for db in &spec.cluster.databases {
|
for db in &spec.cluster.databases {
|
||||||
let name = &db.name;
|
let name = &db.name;
|
||||||
|
|
||||||
|
info_print!("{} - {}:{}", " ".repeat(27 + 5), db.name, db.owner);
|
||||||
|
|
||||||
// XXX: with a limited number of databases it is fine, but consider making it a HashMap
|
// XXX: with a limited number of databases it is fine, but consider making it a HashMap
|
||||||
let pg_db = existing_dbs.iter().find(|r| r.name == *name);
|
let pg_db = existing_dbs.iter().find(|r| r.name == *name);
|
||||||
|
|
||||||
enum DatabaseAction {
|
let start_time = Instant::now();
|
||||||
None,
|
if let Some(r) = pg_db {
|
||||||
Update,
|
|
||||||
Create,
|
|
||||||
}
|
|
||||||
let action = if let Some(r) = pg_db {
|
|
||||||
// XXX: db owner name is returned as quoted string from Postgres,
|
// XXX: db owner name is returned as quoted string from Postgres,
|
||||||
// when quoting is needed.
|
// when quoting is needed.
|
||||||
let new_owner = if r.owner.starts_with('"') {
|
let new_owner = if r.owner.starts_with('"') {
|
||||||
@@ -370,42 +326,29 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if new_owner != r.owner {
|
if new_owner != r.owner {
|
||||||
// Update the owner
|
|
||||||
DatabaseAction::Update
|
|
||||||
} else {
|
|
||||||
DatabaseAction::None
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
DatabaseAction::Create
|
|
||||||
};
|
|
||||||
|
|
||||||
match action {
|
|
||||||
DatabaseAction::None => {}
|
|
||||||
DatabaseAction::Update => {
|
|
||||||
let query: String = format!(
|
let query: String = format!(
|
||||||
"ALTER DATABASE {} OWNER TO {}",
|
"ALTER DATABASE {} OWNER TO {}",
|
||||||
name.pg_quote(),
|
name.pg_quote(),
|
||||||
db.owner.pg_quote()
|
db.owner.pg_quote()
|
||||||
);
|
);
|
||||||
let _ = info_span!("executing", query).entered();
|
info_print!(" -> update");
|
||||||
client.execute(query.as_str(), &[])?;
|
|
||||||
}
|
|
||||||
DatabaseAction::Create => {
|
|
||||||
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
|
|
||||||
query.push_str(&db.to_pg_options());
|
|
||||||
let _ = info_span!("executing", query).entered();
|
|
||||||
client.execute(query.as_str(), &[])?;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if span_enabled!(Level::INFO) {
|
client.execute(query.as_str(), &[])?;
|
||||||
let action_str = match action {
|
let elapsed = start_time.elapsed().as_millis();
|
||||||
DatabaseAction::None => "",
|
info_print!(" ({} ms)", elapsed);
|
||||||
DatabaseAction::Create => " -> create",
|
}
|
||||||
DatabaseAction::Update => " -> update",
|
} else {
|
||||||
};
|
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
|
||||||
info!(" - {}:{}{}", db.name, db.owner, action_str);
|
info_print!(" -> create");
|
||||||
|
|
||||||
|
query.push_str(&db.to_pg_options());
|
||||||
|
client.execute(query.as_str(), &[])?;
|
||||||
|
|
||||||
|
let elapsed = start_time.elapsed().as_millis();
|
||||||
|
info_print!(" ({} ms)", elapsed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
info_print!("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -413,7 +356,6 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
|
|
||||||
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||||
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
||||||
#[instrument(skip_all)]
|
|
||||||
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||||
let spec = &node.spec;
|
let spec = &node.spec;
|
||||||
|
|
||||||
|
|||||||
@@ -1,31 +1,32 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "control_plane"
|
name = "control_plane"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow = "1.0"
|
||||||
clap.workspace = true
|
clap = "4.0"
|
||||||
comfy-table.workspace = true
|
comfy-table = "6.1"
|
||||||
git-version.workspace = true
|
git-version = "0.3.5"
|
||||||
nix.workspace = true
|
nix = "0.25"
|
||||||
once_cell.workspace = true
|
once_cell = "1.13.0"
|
||||||
postgres.workspace = true
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
regex.workspace = true
|
regex = "1"
|
||||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||||
serde.workspace = true
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_with.workspace = true
|
serde_with = "2.0"
|
||||||
tar.workspace = true
|
tar = "0.4.38"
|
||||||
thiserror.workspace = true
|
thiserror = "1"
|
||||||
toml.workspace = true
|
toml = "0.5"
|
||||||
url.workspace = true
|
url = "2.2.2"
|
||||||
|
|
||||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||||
# instead, so that recompile times are better.
|
# instead, so that recompile times are better.
|
||||||
pageserver_api.workspace = true
|
pageserver_api = { path = "../libs/pageserver_api" }
|
||||||
safekeeper_api.workspace = true
|
postgres_connection = { path = "../libs/postgres_connection" }
|
||||||
postgres_connection.workspace = true
|
safekeeper_api = { path = "../libs/safekeeper_api" }
|
||||||
storage_broker.workspace = true
|
# Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||||
utils.workspace = true
|
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||||
|
utils = { path = "../libs/utils" }
|
||||||
workspace_hack.workspace = true
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ name = "ring"
|
|||||||
version = "*"
|
version = "*"
|
||||||
expression = "MIT AND ISC AND OpenSSL"
|
expression = "MIT AND ISC AND OpenSSL"
|
||||||
license-files = [
|
license-files = [
|
||||||
{ path = "LICENSE", hash = 0xbd0eed23 }
|
{ path = "LICENSE", hash = 0xbd0eed23 },
|
||||||
]
|
]
|
||||||
|
|
||||||
[licenses.private]
|
[licenses.private]
|
||||||
|
|||||||
@@ -1,115 +0,0 @@
|
|||||||
### Overview
|
|
||||||
Pageserver and proxy periodically collect consumption metrics and push them to a HTTP endpoint.
|
|
||||||
|
|
||||||
This doc describes current implementation details.
|
|
||||||
For design details see [the RFC](./rfcs/021-metering.md) and [the discussion on Github](https://github.com/neondatabase/neon/pull/2884).
|
|
||||||
|
|
||||||
- The metrics are collected in a separate thread, and the collection interval and endpoint are configurable.
|
|
||||||
|
|
||||||
- Metrics are cached, so that we don't send unchanged metrics on every iteration.
|
|
||||||
|
|
||||||
- Metrics are sent in batches of 1000 (see CHUNK_SIZE const) metrics max with no particular grouping guarantees.
|
|
||||||
|
|
||||||
batch format is
|
|
||||||
```json
|
|
||||||
|
|
||||||
{ "events" : [metric1, metric2, ...] }
|
|
||||||
|
|
||||||
```
|
|
||||||
See metric format examples below.
|
|
||||||
|
|
||||||
- All metrics values are in bytes, unless otherwise specified.
|
|
||||||
|
|
||||||
- Currently no retries are implemented.
|
|
||||||
|
|
||||||
### Pageserver metrics
|
|
||||||
|
|
||||||
#### Configuration
|
|
||||||
The endpoint and the collection interval are specified in the pageserver config file (or can be passed as command line arguments):
|
|
||||||
`metric_collection_endpoint` defaults to None, which means that metric collection is disabled by default.
|
|
||||||
`metric_collection_interval` defaults to 10min
|
|
||||||
|
|
||||||
#### Metrics
|
|
||||||
|
|
||||||
Currently, the following metrics are collected:
|
|
||||||
|
|
||||||
- `written_size`
|
|
||||||
|
|
||||||
Amount of WAL produced by a timeline, i.e. last_record_lsn
|
|
||||||
This is an absolute, per-timeline metric.
|
|
||||||
|
|
||||||
- `resident_size`
|
|
||||||
|
|
||||||
Size of all the layer files in the tenant's directory on disk on the pageserver.
|
|
||||||
This is an absolute, per-tenant metric.
|
|
||||||
|
|
||||||
- `remote_storage_size`
|
|
||||||
|
|
||||||
Size of the remote storage (S3) directory.
|
|
||||||
This is an absolute, per-tenant metric.
|
|
||||||
|
|
||||||
- `timeline_logical_size`
|
|
||||||
Logical size of the data in the timeline
|
|
||||||
This is an absolute, per-timeline metric.
|
|
||||||
|
|
||||||
- `synthetic_storage_size`
|
|
||||||
Size of all tenant's branches including WAL
|
|
||||||
This is the same metric that `tenant/{tenant_id}/size` endpoint returns.
|
|
||||||
This is an absolute, per-tenant metric.
|
|
||||||
|
|
||||||
Synthetic storage size is calculated in a separate thread, so it might be slightly outdated.
|
|
||||||
|
|
||||||
#### Format example
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"metric": "remote_storage_size",
|
|
||||||
"type": "absolute",
|
|
||||||
"time": "2022-12-28T11:07:19.317310284Z",
|
|
||||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
|
||||||
"value": 12345454,
|
|
||||||
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
|
||||||
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
`idempotency_key` is a unique key for each metric, so that we can deduplicate metrics.
|
|
||||||
It is a combination of the time, node_id and a random number.
|
|
||||||
|
|
||||||
### Proxy consumption metrics
|
|
||||||
|
|
||||||
#### Configuration
|
|
||||||
The endpoint and the collection interval can be passed as command line arguments for proxy:
|
|
||||||
`metric_collection_endpoint` no default, which means that metric collection is disabled by default.
|
|
||||||
`metric_collection_interval` no default
|
|
||||||
|
|
||||||
#### Metrics
|
|
||||||
|
|
||||||
Currently, only one proxy metric is collected:
|
|
||||||
|
|
||||||
- `proxy_io_bytes_per_client`
|
|
||||||
Outbound traffic per client.
|
|
||||||
This is an incremental, per-endpoint metric.
|
|
||||||
|
|
||||||
#### Format example
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"metric": "proxy_io_bytes_per_client",
|
|
||||||
"type": "incremental",
|
|
||||||
"start_time": "2022-12-28T11:07:19.317310284Z",
|
|
||||||
"stop_time": "2022-12-28T11:07:19.317310284Z",
|
|
||||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
|
||||||
"value": 12345454,
|
|
||||||
"endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
The metric is incremental, so the value is the difference between the current and the previous value.
|
|
||||||
If there is no previous value, the value is the current value and the `start_time` equals `stop_time`.
|
|
||||||
|
|
||||||
### TODO
|
|
||||||
|
|
||||||
- [ ] Handle errors better: currently if one tenant fails to gather metrics, the whole iteration fails and metrics are not sent for any tenant.
|
|
||||||
- [ ] Add retries
|
|
||||||
- [ ] Tune the interval
|
|
||||||
@@ -1,186 +0,0 @@
|
|||||||
# Consumption tracking
|
|
||||||
|
|
||||||
|
|
||||||
# Goals
|
|
||||||
|
|
||||||
This proposal is made with two mostly but not entirely overlapping goals:
|
|
||||||
|
|
||||||
* Collect info that is needed for consumption-based billing
|
|
||||||
* Cross-check AWS bills
|
|
||||||
|
|
||||||
|
|
||||||
# Metrics
|
|
||||||
|
|
||||||
There are six metrics to collect:
|
|
||||||
|
|
||||||
* CPU time. Wall clock seconds * the current number of cores. We have a fixed ratio of memory to cores, so the current memory size is the function of the number of cores. Measured per each `endpoint`.
|
|
||||||
|
|
||||||
* Traffic. In/out traffic on the proxy. Measured per each `endpoint`.
|
|
||||||
|
|
||||||
* Written size. Amount of data we write. That is different from both traffic and storage size, as only during the writing we
|
|
||||||
|
|
||||||
a) occupy some disk bandwidth on safekeepers
|
|
||||||
|
|
||||||
b) necessarily cross AZ boundaries delivering WAL to all safekeepers
|
|
||||||
|
|
||||||
Each timeline/branch has at most one writer, so the data is collected per branch.
|
|
||||||
|
|
||||||
* Synthetic storage size. That is what is exposed now with pageserver's `/v1/tenant/{}/size`. Looks like now it is per-tenant. (Side note: can we make it per branch to show as branch physical size in UI?)
|
|
||||||
|
|
||||||
* Real storage size. That is the size of the tenant directory on the pageservers disk. Per-tenant.
|
|
||||||
|
|
||||||
* S3 storage size. That is the size of the tenant data on S3. Per-tenant.
|
|
||||||
|
|
||||||
That info should be enough to build an internal model that predicts AWS price (hence tracking `written data` and `real storage size`). As for the billing model we probably can get away with mentioning only `CPU time`, `synthetic storage size`, and `traffic` consumption.
|
|
||||||
|
|
||||||
# Services participating in metrics collection
|
|
||||||
|
|
||||||
## Proxy
|
|
||||||
|
|
||||||
For actual implementation details check `/docs/consumption_metrics.md`
|
|
||||||
|
|
||||||
Proxy is the only place that knows about traffic flow, so it tracks it and reports it with quite a small interval, let's say 1 minute. A small interval is needed here since the proxy is stateless, and any restart will reset accumulated consumption. Also, the proxy should report deltas since the last report, not an absolute value of the counter. Events of this kind are easier to integrate over a period of time to get the amount of traffic during some time interval.
|
|
||||||
|
|
||||||
Example event:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"metric": "proxy_io_bytes_per_client",
|
|
||||||
"type": "incremental",
|
|
||||||
"start_time": "2022-12-28T11:07:19.317310284Z",
|
|
||||||
"stop_time": "2022-12-28T11:07:19.317310284Z",
|
|
||||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
|
||||||
"value": 12345454,
|
|
||||||
"endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Since we report deltas over some period of time, it makes sense to include `event_start_time`/`event_stop_time` where `event_start_time` is the time of the previous report. That will allow us to identify metering gaps better (e.g., failed send/delivery).
|
|
||||||
|
|
||||||
When there is no active connection, the proxy can avoid reporting anything. Also, deltas are additive, so several console instances serving the same user and endpoint can report traffic without coordination.
|
|
||||||
|
|
||||||
## Console
|
|
||||||
|
|
||||||
The console knows about start/stop events, so it knows the amount of CPU time allocated to each endpoint. It also knows about operation successes and failures and can avoid billing clients after unsuccessful 'suspend' events. The console doesn't know the current compute size within the allowed limits on the endpoint. So with CPU time, we do the following:
|
|
||||||
|
|
||||||
* While we don't yet have autoscaling, the console can report `cpu time` as the number of seconds since the last `start_compute` event.
|
|
||||||
|
|
||||||
* When we have autoscaling, `autoscaler-agent` can report `cpu time`*`compute_units_count` in the same increments as the proxy reports traffic.
|
|
||||||
|
|
||||||
Example event:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"metric": "effective_compute_seconds",
|
|
||||||
"type": "incremental",
|
|
||||||
"endpoint_id": "blazing-warrior-34",
|
|
||||||
"event_start_time": ...,
|
|
||||||
"event_stop_time": ...,
|
|
||||||
"value": 12345454,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
I'd also suggest reporting one value, `cpu time`*`compute_units_count`, instead of two separate fields as it makes event schema simpler (it is possible to treat it the same way as traffic) and preserves additivity.
|
|
||||||
|
|
||||||
## Pageserver
|
|
||||||
|
|
||||||
For actual implementation details check `/docs/consumption_metrics.md`
|
|
||||||
|
|
||||||
Pageserver knows / has access to / can calculate the rest of the metrics:
|
|
||||||
|
|
||||||
* Written size -- that is basically `last_received_lsn`,
|
|
||||||
* Synthetic storage size -- there is a way to calculate it, albeit a costly one,
|
|
||||||
* Real storage size -- there is a way to calculate it using a layer map or filesystem,
|
|
||||||
* S3 storage size -- can calculate it by S3 API calls
|
|
||||||
|
|
||||||
Some of those metrics are expensive to calculate, so the reporting period here is driven mainly by implementation details. We can set it to, for example, once per hour. Not a big deal since the pageserver is stateful, and all metrics can be reported as an absolute value, not increments. At the same time, a smaller reporting period improves UX, so it would be good to have something more real-time.
|
|
||||||
|
|
||||||
`written size` is primarily a safekeeper-related metric, but since it is available on both pageserver and safekeeper, we can avoid reporting anything from the safekeeper.
|
|
||||||
|
|
||||||
Example event:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"metric": "remote_storage_size",
|
|
||||||
"type": "absolute",
|
|
||||||
"time": "2022-12-28T11:07:19.317310284Z",
|
|
||||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
|
||||||
"value": 12345454,
|
|
||||||
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
|
||||||
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
# Data collection
|
|
||||||
|
|
||||||
## Push vs. pull
|
|
||||||
|
|
||||||
We already have pull-based Prometheus metrics, so it is tempting to use them here too. However, in our setup, it is hard to tell when some metric changes. For example, garbage collection will constantly free some disk space over a week, even if the project is down for that week. We could also iterate through all existing tenants/branches/endpoints, but that means some amount of code to do that properly and most likely we will end up with some per-metric hacks in the collector to cut out some of the tenants that are surely not changing that metric.
|
|
||||||
|
|
||||||
With the push model, it is easier to publish data only about actively changing metrics -- pageserver knows when it performs s3 offloads, garbage collection and starts/stops consuming data from the safekeeper; proxy knows about connected clients; console / autoscaler-agent knows about active cpu time.
|
|
||||||
|
|
||||||
Hence, let's go with a push-based model.
|
|
||||||
|
|
||||||
## Common bus vs. proxying through the console
|
|
||||||
|
|
||||||
We can implement such push systems in a few ways:
|
|
||||||
|
|
||||||
a. Each component pushes its metrics to the "common bus", namely segment, Kafka, or something similar. That approach scales well, but it would be harder to test it locally, will introduce new dependencies, we will have to distribute secrets for that connection to all of the components, etc. We would also have to loop back some of the events and their aggregates to the console, as we want to show some of those metrics to the user in real-time.
|
|
||||||
|
|
||||||
b. Each component can call HTTP `POST` with its events to the console, and the console can forward them to the segment for later integration with metronome / orb / onebill / etc. With that approach, only the console has to speak with segment. Also, since that data passes through the console, the console can save the latest metrics values, so there is no need for constant feedback of those events back from the segment.
|
|
||||||
|
|
||||||
# Implementation
|
|
||||||
|
|
||||||
Each (proxy|pageserver|autoscaler-agent) sends consumption events to the single endpoint in the console:
|
|
||||||
|
|
||||||
```json
|
|
||||||
POST /usage_events HTTP/1.1
|
|
||||||
Content-Type: application/json
|
|
||||||
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"metric": "remote_storage_size",
|
|
||||||
"type": "absolute",
|
|
||||||
"time": "2022-12-28T11:07:19.317310284Z",
|
|
||||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
|
||||||
"value": 12345454,
|
|
||||||
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
|
||||||
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
|
||||||
},
|
|
||||||
...
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
Events could be either:
|
|
||||||
* `incremental` -- change in consumption since the previous event or service restart. That is `effective_cpu_seconds`, `traffic_in_bytes`, and `traffic_out_bytes`.
|
|
||||||
* `absolute` -- that is the current value of a metric. All of the size-related metrics are absolute.
|
|
||||||
|
|
||||||
Each service can post events at its own pace and bundle together data from different tenants/endpoints.
|
|
||||||
|
|
||||||
The console algorithm upon receipt of events could be the following:
|
|
||||||
|
|
||||||
1. Create and send a segment event with the same content (possibly enriching it with tenant/timeline data for endpoint-based events).
|
|
||||||
2. Update the latest state of per-tenant and per-endpoint metrics in the database.
|
|
||||||
3. Check whether any of those metrics is above the allowed threshold and stop the project if necessary.
|
|
||||||
|
|
||||||
Since all the data comes in batches, we can do the batch update to reduce the number of queries in the database. Proxy traffic is probably the most frequent metric, so with batching, we will have extra `number_of_proxies` requests to the database each minute. This is most likely fine for now but will generate many dead tuples in the console database. If that is the case, we can change step 2 to the following:
|
|
||||||
|
|
||||||
2.1. Check if there is a $tenant_$metric / $endpoint_$metric key in Redis
|
|
||||||
|
|
||||||
2.2. If no stored value is found and the metric is incremental, then fetch the current value from DWH (which keeps aggregated value for all the events) and publish it.
|
|
||||||
|
|
||||||
2.3. Publish a new value (absolute metric) or add an increment to the stored value (incremental metric)
|
|
||||||
|
|
||||||
## Consumption watchdog
|
|
||||||
|
|
||||||
Since all the data goes through the console, we don't have to run any background thread/coroutines to check whether consumption is within the allowed limits. We only change consumption with `POST /usage_events`, so limit checks could be applied in the same handler.
|
|
||||||
|
|
||||||
## Extensibility
|
|
||||||
|
|
||||||
If we need to add a new metric (e.g. s3 traffic or something else), the console code should, by default, process it and publish a segment event, even if the metric name is unknown to the console.
|
|
||||||
|
|
||||||
## Naming & schema
|
|
||||||
|
|
||||||
Each metric name should end with its units -- currently `_seconds` and `_bytes` -- and the segment event should always have `tenant_id` and `timeline_id`/`endpoint_id` where applicable.
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 232 KiB |
@@ -18,6 +18,10 @@ Intended to be used in integration tests and in CLI tools for local installation
|
|||||||
Documentation of the Neon features and concepts.
|
Documentation of the Neon features and concepts.
|
||||||
Now it is mostly dev documentation.
|
Now it is mostly dev documentation.
|
||||||
|
|
||||||
|
`/monitoring`:
|
||||||
|
|
||||||
|
TODO
|
||||||
|
|
||||||
`/pageserver`:
|
`/pageserver`:
|
||||||
|
|
||||||
Neon storage service.
|
Neon storage service.
|
||||||
@@ -94,13 +98,6 @@ cargo hakari manage-deps
|
|||||||
|
|
||||||
If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
|
If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
|
||||||
|
|
||||||
### Checking Rust 3rd-parties
|
|
||||||
[Cargo deny](https://embarkstudios.github.io/cargo-deny/index.html) is a cargo plugin that lets us lint project's dependency graph to ensure all dependencies conform to requirements. It detects security issues, matches licenses, and ensures crates only come from trusted sources.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cargo deny check
|
|
||||||
```
|
|
||||||
|
|
||||||
## Using Python
|
## Using Python
|
||||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||||
so manual installation of dependencies is not recommended.
|
so manual installation of dependencies is not recommended.
|
||||||
|
|||||||
@@ -1,16 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "consumption_metrics"
|
|
||||||
version = "0.1.0"
|
|
||||||
edition = "2021"
|
|
||||||
license = "Apache-2.0"
|
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
anyhow = "1.0.68"
|
|
||||||
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
|
|
||||||
rand = "0.8.3"
|
|
||||||
serde = "1.0.152"
|
|
||||||
serde_with = "2.1.0"
|
|
||||||
utils = { version = "0.1.0", path = "../utils" }
|
|
||||||
workspace_hack = { version = "0.1.0", path = "../../workspace_hack" }
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
//!
|
|
||||||
//! Shared code for consumption metrics collection
|
|
||||||
//!
|
|
||||||
use chrono::{DateTime, Utc};
|
|
||||||
use rand::Rng;
|
|
||||||
use serde::Serialize;
|
|
||||||
|
|
||||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
|
||||||
#[serde(tag = "type")]
|
|
||||||
pub enum EventType {
|
|
||||||
#[serde(rename = "absolute")]
|
|
||||||
Absolute { time: DateTime<Utc> },
|
|
||||||
#[serde(rename = "incremental")]
|
|
||||||
Incremental {
|
|
||||||
start_time: DateTime<Utc>,
|
|
||||||
stop_time: DateTime<Utc>,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
|
||||||
pub struct Event<Extra> {
|
|
||||||
#[serde(flatten)]
|
|
||||||
#[serde(rename = "type")]
|
|
||||||
pub kind: EventType,
|
|
||||||
|
|
||||||
pub metric: &'static str,
|
|
||||||
pub idempotency_key: String,
|
|
||||||
pub value: u64,
|
|
||||||
|
|
||||||
#[serde(flatten)]
|
|
||||||
pub extra: Extra,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn idempotency_key(node_id: String) -> String {
|
|
||||||
format!(
|
|
||||||
"{}-{}-{:04}",
|
|
||||||
Utc::now(),
|
|
||||||
node_id,
|
|
||||||
rand::thread_rng().gen_range(0..=9999)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub const CHUNK_SIZE: usize = 1000;
|
|
||||||
|
|
||||||
// Just a wrapper around a slice of events
|
|
||||||
// to serialize it as `{"events" : [ ] }`
|
|
||||||
#[derive(serde::Serialize)]
|
|
||||||
pub struct EventChunk<'a, T> {
|
|
||||||
pub events: &'a [T],
|
|
||||||
}
|
|
||||||
@@ -1,12 +1,11 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "metrics"
|
name = "metrics"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
prometheus.workspace = true
|
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||||
libc.workspace = true
|
libc = "0.2"
|
||||||
once_cell.workspace = true
|
once_cell = "1.13.0"
|
||||||
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|||||||
@@ -1,17 +1,17 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "pageserver_api"
|
name = "pageserver_api"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
serde.workspace = true
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_with.workspace = true
|
serde_with = "2.0"
|
||||||
const_format.workspace = true
|
const_format = "0.2.21"
|
||||||
anyhow.workspace = true
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
bytes.workspace = true
|
bytes = "1.0.1"
|
||||||
byteorder.workspace = true
|
byteorder = "1.4.3"
|
||||||
utils.workspace = true
|
|
||||||
postgres_ffi.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
utils = { path = "../utils" }
|
||||||
|
postgres_ffi = { path = "../postgres_ffi" }
|
||||||
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use std::num::{NonZeroU64, NonZeroUsize};
|
use std::num::NonZeroU64;
|
||||||
|
|
||||||
use byteorder::{BigEndian, ReadBytesExt};
|
use byteorder::{BigEndian, ReadBytesExt};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
@@ -44,17 +44,18 @@ impl TenantState {
|
|||||||
/// A state of a timeline in pageserver's memory.
|
/// A state of a timeline in pageserver's memory.
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
pub enum TimelineState {
|
pub enum TimelineState {
|
||||||
/// The timeline is recognized by the pageserver but is not yet operational.
|
/// Timeline is fully operational. If the containing Tenant is Active, the timeline's
|
||||||
/// In particular, the walreceiver connection loop is not running for this timeline.
|
/// background jobs are running otherwise they will be launched when the tenant is activated.
|
||||||
/// It will eventually transition to state Active or Broken.
|
|
||||||
Loading,
|
|
||||||
/// The timeline is fully operational.
|
|
||||||
/// It can be queried, and the walreceiver connection loop is running.
|
|
||||||
Active,
|
Active,
|
||||||
/// The timeline was previously Loading or Active but is shutting down.
|
/// A timeline is recognized by pageserver, but not yet ready to operate.
|
||||||
/// It cannot transition back into any other state.
|
/// The status indicates that the timeline could eventually go back to Active automatically:
|
||||||
|
/// for example, if the owning tenant goes back to Active again.
|
||||||
|
Suspended,
|
||||||
|
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
|
||||||
|
/// automatically become Active after certain events: only a management call can change this status.
|
||||||
Stopping,
|
Stopping,
|
||||||
/// The timeline is broken and not operational (previous states: Loading or Active).
|
/// A timeline is recognized by the pageserver, but can no longer be used for
|
||||||
|
/// any operations, because it failed to be activated.
|
||||||
Broken,
|
Broken,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -209,11 +210,6 @@ pub struct TimelineInfo {
|
|||||||
pub state: TimelineState,
|
pub state: TimelineState,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
|
||||||
pub max_concurrent_downloads: NonZeroUsize,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub struct DownloadRemoteLayersTaskInfo {
|
pub struct DownloadRemoteLayersTaskInfo {
|
||||||
pub task_id: String,
|
pub task_id: String,
|
||||||
|
|||||||
@@ -1,17 +1,18 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "postgres_connection"
|
name = "postgres_connection"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow = "1.0"
|
||||||
itertools.workspace = true
|
itertools = "0.10.3"
|
||||||
postgres.workspace = true
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
tokio-postgres.workspace = true
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
url.workspace = true
|
url = "2.2.2"
|
||||||
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
once_cell.workspace = true
|
once_cell = "1.13.0"
|
||||||
|
|||||||
@@ -1,31 +1,30 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "postgres_ffi"
|
name = "postgres_ffi"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
rand.workspace = true
|
rand = "0.8.3"
|
||||||
regex.workspace = true
|
regex = "1.4.5"
|
||||||
bytes.workspace = true
|
bytes = "1.0.1"
|
||||||
byteorder.workspace = true
|
byteorder = "1.4.3"
|
||||||
anyhow.workspace = true
|
anyhow = "1.0"
|
||||||
crc32c.workspace = true
|
crc32c = "0.6.0"
|
||||||
hex.workspace = true
|
hex = "0.4.3"
|
||||||
once_cell.workspace = true
|
once_cell = "1.13.0"
|
||||||
log.workspace = true
|
log = "0.4.14"
|
||||||
memoffset.workspace = true
|
memoffset = "0.7"
|
||||||
thiserror.workspace = true
|
thiserror = "1.0"
|
||||||
serde.workspace = true
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
utils.workspace = true
|
utils = { path = "../utils" }
|
||||||
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
env_logger.workspace = true
|
env_logger = "0.9"
|
||||||
postgres.workspace = true
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
wal_craft = { path = "wal_craft" }
|
wal_craft = { path = "wal_craft" }
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
anyhow.workspace = true
|
anyhow = "1.0"
|
||||||
bindgen.workspace = true
|
bindgen = "0.61"
|
||||||
|
|||||||
@@ -1,17 +1,17 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "wal_craft"
|
name = "wal_craft"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow = "1.0"
|
||||||
clap.workspace = true
|
clap = "4.0"
|
||||||
env_logger.workspace = true
|
env_logger = "0.9"
|
||||||
log.workspace = true
|
log = "0.4"
|
||||||
once_cell.workspace = true
|
once_cell = "1.13.0"
|
||||||
postgres.workspace = true
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
postgres_ffi.workspace = true
|
postgres_ffi = { path = "../" }
|
||||||
tempfile.workspace = true
|
tempfile = "3.2"
|
||||||
|
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|||||||
@@ -1,18 +1,18 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "pq_proto"
|
name = "pq_proto"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow = "1.0"
|
||||||
bytes.workspace = true
|
bytes = "1.0.1"
|
||||||
pin-project-lite.workspace = true
|
pin-project-lite = "0.2.7"
|
||||||
postgres-protocol.workspace = true
|
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
rand.workspace = true
|
rand = "0.8.3"
|
||||||
serde.workspace = true
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
tokio.workspace = true
|
tokio = { version = "1.17", features = ["macros"] }
|
||||||
tracing.workspace = true
|
tracing = "0.1"
|
||||||
thiserror.workspace = true
|
thiserror = "1.0"
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
|
|||||||
@@ -1,28 +1,28 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "remote_storage"
|
name = "remote_storage"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
async-trait.workspace = true
|
async-trait = "0.1"
|
||||||
once_cell.workspace = true
|
metrics = { version = "0.1", path = "../metrics" }
|
||||||
aws-smithy-http.workspace = true
|
utils = { version = "0.1", path = "../utils" }
|
||||||
aws-types.workspace = true
|
once_cell = "1.13.0"
|
||||||
aws-config.workspace = true
|
aws-smithy-http = "0.51.0"
|
||||||
aws-sdk-s3.workspace = true
|
aws-types = "0.51.0"
|
||||||
hyper = { workspace = true, features = ["stream"] }
|
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
|
||||||
serde.workspace = true
|
aws-sdk-s3 = "0.21.0"
|
||||||
serde_json.workspace = true
|
hyper = { version = "0.14", features = ["stream"] }
|
||||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
tokio-util.workspace = true
|
serde_json = "1"
|
||||||
toml_edit.workspace = true
|
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||||
tracing.workspace = true
|
tokio-util = { version = "0.7", features = ["io"] }
|
||||||
metrics.workspace = true
|
toml_edit = { version = "0.14", features = ["easy"] }
|
||||||
utils.workspace = true
|
tracing = "0.1.27"
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tempfile.workspace = true
|
tempfile = "3.2"
|
||||||
|
|||||||
@@ -1,13 +1,13 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "safekeeper_api"
|
name = "safekeeper_api"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
serde.workspace = true
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_with.workspace = true
|
serde_with = "2.0"
|
||||||
const_format.workspace = true
|
const_format = "0.2.21"
|
||||||
utils.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
utils = { path = "../utils" }
|
||||||
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
|
|||||||
@@ -1,11 +1,9 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "tenant_size_model"
|
name = "tenant_size_model"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
publish = false
|
publish = false
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use anyhow::Context;
|
|
||||||
|
|
||||||
/// Pricing model or history size builder.
|
/// Pricing model or history size builder.
|
||||||
///
|
///
|
||||||
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||||
@@ -134,25 +132,22 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
op: Cow<'static, str>,
|
op: Cow<'static, str>,
|
||||||
lsn: u64,
|
lsn: u64,
|
||||||
size: Option<u64>,
|
size: Option<u64>,
|
||||||
) -> anyhow::Result<()>
|
) where
|
||||||
where
|
|
||||||
K: std::borrow::Borrow<Q>,
|
K: std::borrow::Borrow<Q>,
|
||||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
Q: std::hash::Hash + Eq,
|
||||||
{
|
{
|
||||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||||
let newseg_id = self.segments.len();
|
let newseg_id = self.segments.len();
|
||||||
let lastseg = &mut self.segments[lastseg_id];
|
let lastseg = &mut self.segments[lastseg_id];
|
||||||
|
|
||||||
assert!(lsn > lastseg.end_lsn);
|
assert!(lsn > lastseg.end_lsn);
|
||||||
|
|
||||||
let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
|
||||||
|
|
||||||
let newseg = Segment {
|
let newseg = Segment {
|
||||||
op,
|
op,
|
||||||
parent: Some(lastseg_id),
|
parent: Some(lastseg_id),
|
||||||
start_lsn: lastseg.end_lsn,
|
start_lsn: lastseg.end_lsn,
|
||||||
end_lsn: lsn,
|
end_lsn: lsn,
|
||||||
start_size,
|
start_size: lastseg.end_size.unwrap(),
|
||||||
end_size: size,
|
end_size: size,
|
||||||
children_after: Vec::new(),
|
children_after: Vec::new(),
|
||||||
needed: false,
|
needed: false,
|
||||||
@@ -161,8 +156,6 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
|
|
||||||
self.segments.push(newseg);
|
self.segments.push(newseg);
|
||||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||||
@@ -172,24 +165,21 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
op: Cow<'static, str>,
|
op: Cow<'static, str>,
|
||||||
lsn_bytes: u64,
|
lsn_bytes: u64,
|
||||||
size_bytes: i64,
|
size_bytes: i64,
|
||||||
) -> anyhow::Result<()>
|
) where
|
||||||
where
|
|
||||||
K: std::borrow::Borrow<Q>,
|
K: std::borrow::Borrow<Q>,
|
||||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
Q: std::hash::Hash + Eq,
|
||||||
{
|
{
|
||||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||||
let newseg_id = self.segments.len();
|
let newseg_id = self.segments.len();
|
||||||
let lastseg = &mut self.segments[lastseg_id];
|
let lastseg = &mut self.segments[lastseg_id];
|
||||||
|
|
||||||
let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
|
||||||
|
|
||||||
let newseg = Segment {
|
let newseg = Segment {
|
||||||
op,
|
op,
|
||||||
parent: Some(lastseg_id),
|
parent: Some(lastseg_id),
|
||||||
start_lsn: lastseg.end_lsn,
|
start_lsn: lastseg.end_lsn,
|
||||||
end_lsn: lastseg.end_lsn + lsn_bytes,
|
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||||
start_size: last_end_size,
|
start_size: lastseg.end_size.unwrap(),
|
||||||
end_size: Some((last_end_size as i64 + size_bytes) as u64),
|
end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64),
|
||||||
children_after: Vec::new(),
|
children_after: Vec::new(),
|
||||||
needed: false,
|
needed: false,
|
||||||
};
|
};
|
||||||
@@ -197,54 +187,50 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
|
|
||||||
self.segments.push(newseg);
|
self.segments.push(newseg);
|
||||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||||
where
|
where
|
||||||
K: std::borrow::Borrow<Q>,
|
K: std::borrow::Borrow<Q>,
|
||||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
Q: std::hash::Hash + Eq,
|
||||||
{
|
{
|
||||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
|
self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||||
where
|
where
|
||||||
K: std::borrow::Borrow<Q>,
|
K: std::borrow::Borrow<Q>,
|
||||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
Q: std::hash::Hash + Eq,
|
||||||
{
|
{
|
||||||
self.modify_branch(branch, "update".into(), bytes, 0i64)
|
self.modify_branch(branch, "update".into(), bytes, 0i64);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||||
where
|
where
|
||||||
K: std::borrow::Borrow<Q>,
|
K: std::borrow::Borrow<Q>,
|
||||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
Q: std::hash::Hash + Eq,
|
||||||
{
|
{
|
||||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
|
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
|
/// Panics if the parent branch cannot be found.
|
||||||
|
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
|
||||||
where
|
where
|
||||||
K: std::borrow::Borrow<Q> + std::fmt::Debug,
|
K: std::borrow::Borrow<Q>,
|
||||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
Q: std::hash::Hash + Eq,
|
||||||
{
|
{
|
||||||
// Find the right segment
|
// Find the right segment
|
||||||
let branchseg_id = *self.branches.get(parent).with_context(|| {
|
let branchseg_id = *self
|
||||||
format!(
|
.branches
|
||||||
"should had found the parent {:?} by key. in branches {:?}",
|
.get(parent)
|
||||||
parent, self.branches
|
.expect("should had found the parent by key");
|
||||||
)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let _branchseg = &mut self.segments[branchseg_id];
|
let _branchseg = &mut self.segments[branchseg_id];
|
||||||
|
|
||||||
// Create branch name for it
|
// Create branch name for it
|
||||||
self.branches.insert(name, branchseg_id);
|
self.branches.insert(name, branchseg_id);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
|
pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
|
||||||
// Phase 1: Mark all the segments that need to be retained
|
// Phase 1: Mark all the segments that need to be retained
|
||||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||||
let last_seg = &self.segments[last_seg_id];
|
let last_seg = &self.segments[last_seg_id];
|
||||||
@@ -269,7 +255,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
self.size_from_snapshot_later(0)
|
self.size_from_snapshot_later(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
|
||||||
let seg = &self.segments[seg_id];
|
let seg = &self.segments[seg_id];
|
||||||
|
|
||||||
let this_size = seg.end_lsn - seg.start_lsn;
|
let this_size = seg.end_lsn - seg.start_lsn;
|
||||||
@@ -280,10 +266,10 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
for &child_id in seg.children_after.iter() {
|
for &child_id in seg.children_after.iter() {
|
||||||
// try each child both ways
|
// try each child both ways
|
||||||
let child = &self.segments[child_id];
|
let child = &self.segments[child_id];
|
||||||
let p1 = self.size_from_wal(child_id)?;
|
let p1 = self.size_from_wal(child_id);
|
||||||
|
|
||||||
let p = if !child.needed {
|
let p = if !child.needed {
|
||||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
let p2 = self.size_from_snapshot_later(child_id);
|
||||||
if p1.total() < p2.total() {
|
if p1.total() < p2.total() {
|
||||||
p1
|
p1
|
||||||
} else {
|
} else {
|
||||||
@@ -294,15 +280,15 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
};
|
};
|
||||||
children.push(p);
|
children.push(p);
|
||||||
}
|
}
|
||||||
Ok(SegmentSize {
|
SegmentSize {
|
||||||
seg_id,
|
seg_id,
|
||||||
method: if seg.needed { WalNeeded } else { Wal },
|
method: if seg.needed { WalNeeded } else { Wal },
|
||||||
this_size,
|
this_size,
|
||||||
children,
|
children,
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
|
||||||
// If this is needed, then it's time to do the snapshot and continue
|
// If this is needed, then it's time to do the snapshot and continue
|
||||||
// with wal method.
|
// with wal method.
|
||||||
let seg = &self.segments[seg_id];
|
let seg = &self.segments[seg_id];
|
||||||
@@ -313,10 +299,10 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
for &child_id in seg.children_after.iter() {
|
for &child_id in seg.children_after.iter() {
|
||||||
// try each child both ways
|
// try each child both ways
|
||||||
let child = &self.segments[child_id];
|
let child = &self.segments[child_id];
|
||||||
let p1 = self.size_from_wal(child_id)?;
|
let p1 = self.size_from_wal(child_id);
|
||||||
|
|
||||||
let p = if !child.needed {
|
let p = if !child.needed {
|
||||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
let p2 = self.size_from_snapshot_later(child_id);
|
||||||
if p1.total() < p2.total() {
|
if p1.total() < p2.total() {
|
||||||
p1
|
p1
|
||||||
} else {
|
} else {
|
||||||
@@ -327,12 +313,12 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
};
|
};
|
||||||
children.push(p);
|
children.push(p);
|
||||||
}
|
}
|
||||||
Ok(SegmentSize {
|
SegmentSize {
|
||||||
seg_id,
|
seg_id,
|
||||||
method: WalNeeded,
|
method: WalNeeded,
|
||||||
this_size: seg.start_size,
|
this_size: seg.start_size,
|
||||||
children,
|
children,
|
||||||
})
|
}
|
||||||
} else {
|
} else {
|
||||||
// If any of the direct children are "needed", need to be able to reconstruct here
|
// If any of the direct children are "needed", need to be able to reconstruct here
|
||||||
let mut children_needed = false;
|
let mut children_needed = false;
|
||||||
@@ -347,7 +333,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
let method1 = if !children_needed {
|
let method1 = if !children_needed {
|
||||||
let mut children = Vec::new();
|
let mut children = Vec::new();
|
||||||
for child in seg.children_after.iter() {
|
for child in seg.children_after.iter() {
|
||||||
children.push(self.size_from_snapshot_later(*child)?);
|
children.push(self.size_from_snapshot_later(*child));
|
||||||
}
|
}
|
||||||
Some(SegmentSize {
|
Some(SegmentSize {
|
||||||
seg_id,
|
seg_id,
|
||||||
@@ -363,25 +349,20 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||||
let mut children = Vec::new();
|
let mut children = Vec::new();
|
||||||
for child in seg.children_after.iter() {
|
for child in seg.children_after.iter() {
|
||||||
children.push(self.size_from_wal(*child)?);
|
children.push(self.size_from_wal(*child));
|
||||||
}
|
}
|
||||||
let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
|
|
||||||
Some(SegmentSize {
|
Some(SegmentSize {
|
||||||
seg_id,
|
seg_id,
|
||||||
method: SnapshotAfter,
|
method: SnapshotAfter,
|
||||||
this_size,
|
this_size: seg.end_size.unwrap(),
|
||||||
children,
|
children,
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(match (method1, method2) {
|
match (method1, method2) {
|
||||||
(None, None) => anyhow::bail!(
|
(None, None) => panic!(),
|
||||||
"neither method was applicable: children_after={}, children_needed={}",
|
|
||||||
seg.children_after.len(),
|
|
||||||
children_needed
|
|
||||||
),
|
|
||||||
(Some(method), None) => method,
|
(Some(method), None) => method,
|
||||||
(None, Some(method)) => method,
|
(None, Some(method)) => method,
|
||||||
(Some(method1), Some(method2)) => {
|
(Some(method1), Some(method2)) => {
|
||||||
@@ -391,7 +372,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
|||||||
method2
|
method2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -7,118 +7,118 @@
|
|||||||
use tenant_size_model::{Segment, SegmentSize, Storage};
|
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||||
|
|
||||||
// Main branch only. Some updates on it.
|
// Main branch only. Some updates on it.
|
||||||
fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
fn scenario_1() -> (Vec<Segment>, SegmentSize) {
|
||||||
// Create main branch
|
// Create main branch
|
||||||
let mut storage = Storage::new("main");
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
// Bulk load 5 GB of data to it
|
||||||
storage.insert("main", 5_000)?;
|
storage.insert("main", 5_000);
|
||||||
|
|
||||||
// Stream of updates
|
// Stream of updates
|
||||||
for _ in 0..5 {
|
for _ in 0..5 {
|
||||||
storage.update("main", 1_000)?;
|
storage.update("main", 1_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
let size = storage.calculate(1000)?;
|
let size = storage.calculate(1000);
|
||||||
|
|
||||||
Ok((storage.into_segments(), size))
|
(storage.into_segments(), size)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Main branch only. Some updates on it.
|
// Main branch only. Some updates on it.
|
||||||
fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
fn scenario_2() -> (Vec<Segment>, SegmentSize) {
|
||||||
// Create main branch
|
// Create main branch
|
||||||
let mut storage = Storage::new("main");
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
// Bulk load 5 GB of data to it
|
||||||
storage.insert("main", 5_000)?;
|
storage.insert("main", 5_000);
|
||||||
|
|
||||||
// Stream of updates
|
// Stream of updates
|
||||||
for _ in 0..5 {
|
for _ in 0..5 {
|
||||||
storage.update("main", 1_000)?;
|
storage.update("main", 1_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Branch
|
// Branch
|
||||||
storage.branch("main", "child")?;
|
storage.branch("main", "child");
|
||||||
storage.update("child", 1_000)?;
|
storage.update("child", 1_000);
|
||||||
|
|
||||||
// More updates on parent
|
// More updates on parent
|
||||||
storage.update("main", 1_000)?;
|
storage.update("main", 1_000);
|
||||||
|
|
||||||
let size = storage.calculate(1000)?;
|
let size = storage.calculate(1000);
|
||||||
|
|
||||||
Ok((storage.into_segments(), size))
|
(storage.into_segments(), size)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Like 2, but more updates on main
|
// Like 2, but more updates on main
|
||||||
fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
fn scenario_3() -> (Vec<Segment>, SegmentSize) {
|
||||||
// Create main branch
|
// Create main branch
|
||||||
let mut storage = Storage::new("main");
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
// Bulk load 5 GB of data to it
|
||||||
storage.insert("main", 5_000)?;
|
storage.insert("main", 5_000);
|
||||||
|
|
||||||
// Stream of updates
|
// Stream of updates
|
||||||
for _ in 0..5 {
|
for _ in 0..5 {
|
||||||
storage.update("main", 1_000)?;
|
storage.update("main", 1_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Branch
|
// Branch
|
||||||
storage.branch("main", "child")?;
|
storage.branch("main", "child");
|
||||||
storage.update("child", 1_000)?;
|
storage.update("child", 1_000);
|
||||||
|
|
||||||
// More updates on parent
|
// More updates on parent
|
||||||
for _ in 0..5 {
|
for _ in 0..5 {
|
||||||
storage.update("main", 1_000)?;
|
storage.update("main", 1_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
let size = storage.calculate(1000)?;
|
let size = storage.calculate(1000);
|
||||||
|
|
||||||
Ok((storage.into_segments(), size))
|
(storage.into_segments(), size)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Diverged branches
|
// Diverged branches
|
||||||
fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
fn scenario_4() -> (Vec<Segment>, SegmentSize) {
|
||||||
// Create main branch
|
// Create main branch
|
||||||
let mut storage = Storage::new("main");
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
// Bulk load 5 GB of data to it
|
||||||
storage.insert("main", 5_000)?;
|
storage.insert("main", 5_000);
|
||||||
|
|
||||||
// Stream of updates
|
// Stream of updates
|
||||||
for _ in 0..5 {
|
for _ in 0..5 {
|
||||||
storage.update("main", 1_000)?;
|
storage.update("main", 1_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Branch
|
// Branch
|
||||||
storage.branch("main", "child")?;
|
storage.branch("main", "child");
|
||||||
storage.update("child", 1_000)?;
|
storage.update("child", 1_000);
|
||||||
|
|
||||||
// More updates on parent
|
// More updates on parent
|
||||||
for _ in 0..8 {
|
for _ in 0..8 {
|
||||||
storage.update("main", 1_000)?;
|
storage.update("main", 1_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
let size = storage.calculate(1000)?;
|
let size = storage.calculate(1000);
|
||||||
|
|
||||||
Ok((storage.into_segments(), size))
|
(storage.into_segments(), size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
fn scenario_5() -> (Vec<Segment>, SegmentSize) {
|
||||||
let mut storage = Storage::new("a");
|
let mut storage = Storage::new("a");
|
||||||
storage.insert("a", 5000)?;
|
storage.insert("a", 5000);
|
||||||
storage.branch("a", "b")?;
|
storage.branch("a", "b");
|
||||||
storage.update("b", 4000)?;
|
storage.update("b", 4000);
|
||||||
storage.update("a", 2000)?;
|
storage.update("a", 2000);
|
||||||
storage.branch("a", "c")?;
|
storage.branch("a", "c");
|
||||||
storage.insert("c", 4000)?;
|
storage.insert("c", 4000);
|
||||||
storage.insert("a", 2000)?;
|
storage.insert("a", 2000);
|
||||||
|
|
||||||
let size = storage.calculate(5000)?;
|
let size = storage.calculate(5000);
|
||||||
|
|
||||||
Ok((storage.into_segments(), size))
|
(storage.into_segments(), size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
fn scenario_6() -> (Vec<Segment>, SegmentSize) {
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||||
@@ -133,18 +133,18 @@ fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
|||||||
|
|
||||||
let mut storage = Storage::new(None);
|
let mut storage = Storage::new(None);
|
||||||
|
|
||||||
storage.branch(&None, branches[0])?; // at 0
|
storage.branch(&None, branches[0]); // at 0
|
||||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
|
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
|
||||||
storage.branch(&branches[0], branches[1])?; // at 108951064
|
storage.branch(&branches[0], branches[1]); // at 108951064
|
||||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
|
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
|
||||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
|
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
|
||||||
storage.branch(&branches[0], branches[2])?; // at 283415424
|
storage.branch(&branches[0], branches[2]); // at 283415424
|
||||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
|
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
|
||||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
|
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
|
||||||
|
|
||||||
let size = storage.calculate(100_000)?;
|
let size = storage.calculate(100_000);
|
||||||
|
|
||||||
Ok((storage.into_segments(), size))
|
(storage.into_segments(), size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
@@ -163,8 +163,7 @@ fn main() {
|
|||||||
eprintln!("invalid scenario {}", other);
|
eprintln!("invalid scenario {}", other);
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
graphviz_tree(&segments, &size);
|
graphviz_tree(&segments, &size);
|
||||||
}
|
}
|
||||||
@@ -252,7 +251,7 @@ fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn scenarios_return_same_size() {
|
fn scenarios_return_same_size() {
|
||||||
type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
|
type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
|
||||||
let truths: &[(u32, ScenarioFn, _)] = &[
|
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||||
(line!(), scenario_1, 8000),
|
(line!(), scenario_1, 8000),
|
||||||
(line!(), scenario_2, 9000),
|
(line!(), scenario_2, 9000),
|
||||||
@@ -263,7 +262,7 @@ fn scenarios_return_same_size() {
|
|||||||
];
|
];
|
||||||
|
|
||||||
for (line, scenario, expected) in truths {
|
for (line, scenario, expected) in truths {
|
||||||
let (_, size) = scenario().unwrap();
|
let (_, size) = scenario();
|
||||||
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,17 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "tracing-utils"
|
|
||||||
version = "0.1.0"
|
|
||||||
edition.workspace = true
|
|
||||||
license.workspace = true
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
hyper.workspace = true
|
|
||||||
opentelemetry = { workspace = true, features=["rt-tokio"] }
|
|
||||||
opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
|
||||||
opentelemetry-semantic-conventions.workspace = true
|
|
||||||
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
|
|
||||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
|
||||||
tracing.workspace = true
|
|
||||||
tracing-opentelemetry.workspace = true
|
|
||||||
tracing-subscriber.workspace = true
|
|
||||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
//! Tracing wrapper for Hyper HTTP server
|
|
||||||
|
|
||||||
use hyper::HeaderMap;
|
|
||||||
use hyper::{Body, Request, Response};
|
|
||||||
use std::future::Future;
|
|
||||||
use tracing::Instrument;
|
|
||||||
use tracing_opentelemetry::OpenTelemetrySpanExt;
|
|
||||||
|
|
||||||
/// Configuration option for what to use as the "otel.name" field in the traces.
|
|
||||||
pub enum OtelName<'a> {
|
|
||||||
/// Use a constant string
|
|
||||||
Constant(&'a str),
|
|
||||||
|
|
||||||
/// Use the path from the request.
|
|
||||||
///
|
|
||||||
/// That's very useful information, but is not appropriate if the
|
|
||||||
/// path contains parameters that differ on every request, or worse,
|
|
||||||
/// sensitive information like usernames or email addresses.
|
|
||||||
///
|
|
||||||
/// See <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md#name>
|
|
||||||
UriPath,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Handle an incoming HTTP request using the given handler function,
|
|
||||||
/// with OpenTelemetry tracing.
|
|
||||||
///
|
|
||||||
/// This runs 'handler' on the request in a new span, with fields filled in
|
|
||||||
/// from the request. Notably, if the request contains tracing information,
|
|
||||||
/// it is propagated to the span, so that this request is traced as part of
|
|
||||||
/// the same trace.
|
|
||||||
///
|
|
||||||
/// XXX: Usually, this is handled by existing libraries, or built
|
|
||||||
/// directly into HTTP servers. However, I couldn't find one for Hyper,
|
|
||||||
/// so I had to write our own. OpenTelemetry website has a registry of
|
|
||||||
/// instrumentation libraries at:
|
|
||||||
/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
|
|
||||||
/// If a Hyper crate appears, consider switching to that.
|
|
||||||
pub async fn tracing_handler<F, R>(
|
|
||||||
req: Request<Body>,
|
|
||||||
handler: F,
|
|
||||||
otel_name: OtelName<'_>,
|
|
||||||
) -> Response<Body>
|
|
||||||
where
|
|
||||||
F: Fn(Request<Body>) -> R,
|
|
||||||
R: Future<Output = Response<Body>>,
|
|
||||||
{
|
|
||||||
// Create a tracing span, with context propagated from the incoming
|
|
||||||
// request if any.
|
|
||||||
//
|
|
||||||
// See list of standard fields defined for HTTP requests at
|
|
||||||
// https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md
|
|
||||||
// We only fill in a few of the most useful ones here.
|
|
||||||
let otel_name = match otel_name {
|
|
||||||
OtelName::Constant(s) => s,
|
|
||||||
OtelName::UriPath => req.uri().path(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let span = tracing::info_span!(
|
|
||||||
"http request",
|
|
||||||
otel.name= %otel_name,
|
|
||||||
http.method = %req.method(),
|
|
||||||
http.status_code = tracing::field::Empty,
|
|
||||||
);
|
|
||||||
let parent_ctx = extract_remote_context(req.headers());
|
|
||||||
span.set_parent(parent_ctx);
|
|
||||||
|
|
||||||
// Handle the request within the span
|
|
||||||
let response = handler(req).instrument(span.clone()).await;
|
|
||||||
|
|
||||||
// Fill in the fields from the response code
|
|
||||||
let status = response.status();
|
|
||||||
span.record("http.status_code", status.as_str());
|
|
||||||
span.record(
|
|
||||||
"otel.status_code",
|
|
||||||
if status.is_success() { "OK" } else { "ERROR" },
|
|
||||||
);
|
|
||||||
|
|
||||||
response
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract remote tracing context from the HTTP headers
|
|
||||||
fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context {
|
|
||||||
struct HeaderExtractor<'a>(&'a HeaderMap);
|
|
||||||
|
|
||||||
impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> {
|
|
||||||
fn get(&self, key: &str) -> Option<&str> {
|
|
||||||
self.0.get(key).and_then(|value| value.to_str().ok())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn keys(&self) -> Vec<&str> {
|
|
||||||
self.0.keys().map(|value| value.as_str()).collect()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let extractor = HeaderExtractor(headers);
|
|
||||||
opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor))
|
|
||||||
}
|
|
||||||
@@ -1,168 +0,0 @@
|
|||||||
//! Helper functions to set up OpenTelemetry tracing.
|
|
||||||
//!
|
|
||||||
//! This comes in two variants, depending on whether you have a Tokio runtime available.
|
|
||||||
//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use
|
|
||||||
//! the current tokio runtime. If you don't have a runtime available, or you don't want
|
|
||||||
//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()`
|
|
||||||
//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks.
|
|
||||||
//!
|
|
||||||
//! Example:
|
|
||||||
//!
|
|
||||||
//! ```rust,no_run
|
|
||||||
//! use tracing_subscriber::prelude::*;
|
|
||||||
//! use tracing_opentelemetry::OpenTelemetryLayer;
|
|
||||||
//!
|
|
||||||
//! #[tokio::main]
|
|
||||||
//! async fn main() {
|
|
||||||
//! // Set up logging to stderr
|
|
||||||
//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
|
||||||
//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
|
|
||||||
//! let fmt_layer = tracing_subscriber::fmt::layer()
|
|
||||||
//! .with_target(false)
|
|
||||||
//! .with_writer(std::io::stderr);
|
|
||||||
//!
|
|
||||||
//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
|
|
||||||
//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new);
|
|
||||||
//!
|
|
||||||
//! // Put it all together
|
|
||||||
//! tracing_subscriber::registry()
|
|
||||||
//! .with(env_filter)
|
|
||||||
//! .with(otlp_layer)
|
|
||||||
//! .with(fmt_layer)
|
|
||||||
//! .init();
|
|
||||||
//! }
|
|
||||||
//! ```
|
|
||||||
|
|
||||||
use opentelemetry::sdk::Resource;
|
|
||||||
use opentelemetry::KeyValue;
|
|
||||||
use opentelemetry_otlp::WithExportConfig;
|
|
||||||
use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
|
|
||||||
|
|
||||||
pub use tracing_opentelemetry::OpenTelemetryLayer;
|
|
||||||
|
|
||||||
pub mod http;
|
|
||||||
|
|
||||||
/// Set up OpenTelemetry exporter, using configuration from environment variables.
|
|
||||||
///
|
|
||||||
/// `service_name` is set as the OpenTelemetry 'service.name' resource (see
|
|
||||||
/// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/resource/semantic_conventions/README.md#service>)
|
|
||||||
///
|
|
||||||
/// We try to follow the conventions for the environment variables specified in
|
|
||||||
/// <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables/>
|
|
||||||
///
|
|
||||||
/// However, we only support a subset of those options:
|
|
||||||
///
|
|
||||||
/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing
|
|
||||||
/// is enabled by default. Set it to "true" to disable.
|
|
||||||
///
|
|
||||||
/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_*
|
|
||||||
/// settings specified in
|
|
||||||
/// <https://opentelemetry.io/docs/reference/specification/protocol/exporter/>
|
|
||||||
/// are supported, as they are handled by the `opentelemetry-otlp` crate.
|
|
||||||
/// Settings related to other exporters have no effect.
|
|
||||||
///
|
|
||||||
/// - Some other settings are supported by the `opentelemetry` crate.
|
|
||||||
///
|
|
||||||
/// If you need some other setting, please test if it works first. And perhaps
|
|
||||||
/// add a comment in the list above to save the effort of testing for the next
|
|
||||||
/// person.
|
|
||||||
///
|
|
||||||
/// This doesn't block, but is marked as 'async' to hint that this must be called in
|
|
||||||
/// asynchronous execution context.
|
|
||||||
pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
|
|
||||||
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
|
|
||||||
return None;
|
|
||||||
};
|
|
||||||
Some(init_tracing_internal(service_name.to_string()))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
|
|
||||||
/// tasks.
|
|
||||||
pub fn init_tracing_without_runtime(
|
|
||||||
service_name: &str,
|
|
||||||
) -> Option<opentelemetry::sdk::trace::Tracer> {
|
|
||||||
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
|
|
||||||
return None;
|
|
||||||
};
|
|
||||||
|
|
||||||
// The opentelemetry batch processor and the OTLP exporter needs a Tokio
|
|
||||||
// runtime. Create a dedicated runtime for them. One thread should be
|
|
||||||
// enough.
|
|
||||||
//
|
|
||||||
// (Alternatively, instead of batching, we could use the "simple
|
|
||||||
// processor", which doesn't need Tokio, and use "reqwest-blocking"
|
|
||||||
// feature for the OTLP exporter, which also doesn't need Tokio. However,
|
|
||||||
// batching is considered best practice, and also I have the feeling that
|
|
||||||
// the non-Tokio codepaths in the opentelemetry crate are less used and
|
|
||||||
// might be more buggy, so better to stay on the well-beaten path.)
|
|
||||||
//
|
|
||||||
// We leak the runtime so that it keeps running after we exit the
|
|
||||||
// function.
|
|
||||||
let runtime = Box::leak(Box::new(
|
|
||||||
tokio::runtime::Builder::new_multi_thread()
|
|
||||||
.enable_all()
|
|
||||||
.thread_name("otlp runtime thread")
|
|
||||||
.worker_threads(1)
|
|
||||||
.build()
|
|
||||||
.unwrap(),
|
|
||||||
));
|
|
||||||
let _guard = runtime.enter();
|
|
||||||
|
|
||||||
Some(init_tracing_internal(service_name.to_string()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
|
|
||||||
// Set up exporter from the OTEL_EXPORTER_* environment variables
|
|
||||||
let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();
|
|
||||||
|
|
||||||
// XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
|
|
||||||
// OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
|
|
||||||
// OpenTelemetry spec at
|
|
||||||
// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#endpoint-urls-for-otlphttp>,
|
|
||||||
// the full exporter URL is formed by appending "/v1/traces" to the value
|
|
||||||
// of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does
|
|
||||||
// that with the grpc-tonic exporter. Other exporters, like the HTTP
|
|
||||||
// exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without
|
|
||||||
// appending "/v1/traces".
|
|
||||||
//
|
|
||||||
// See https://github.com/open-telemetry/opentelemetry-rust/pull/950
|
|
||||||
//
|
|
||||||
// Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting
|
|
||||||
// the endpoint url with the "/v1/traces" path ourselves. If the bug is
|
|
||||||
// fixed in a later version, we can remove this code. But if we don't
|
|
||||||
// remember to remove this, it won't do any harm either, as the crate will
|
|
||||||
// just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint
|
|
||||||
// is set directly with `with_endpoint`.
|
|
||||||
if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() {
|
|
||||||
if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) {
|
|
||||||
if !endpoint.ends_with('/') {
|
|
||||||
endpoint.push('/');
|
|
||||||
}
|
|
||||||
endpoint.push_str("v1/traces");
|
|
||||||
exporter = exporter.with_endpoint(endpoint);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Propagate trace information in the standard W3C TraceContext format.
|
|
||||||
opentelemetry::global::set_text_map_propagator(
|
|
||||||
opentelemetry::sdk::propagation::TraceContextPropagator::new(),
|
|
||||||
);
|
|
||||||
|
|
||||||
opentelemetry_otlp::new_pipeline()
|
|
||||||
.tracing()
|
|
||||||
.with_exporter(exporter)
|
|
||||||
.with_trace_config(
|
|
||||||
opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
|
|
||||||
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
|
|
||||||
service_name,
|
|
||||||
)])),
|
|
||||||
)
|
|
||||||
.install_batch(opentelemetry::runtime::Tokio)
|
|
||||||
.expect("could not initialize opentelemetry exporter")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
|
||||||
// pending traces before we exit.
|
|
||||||
pub fn shutdown_tracing() {
|
|
||||||
opentelemetry::global::shutdown_tracer_provider();
|
|
||||||
}
|
|
||||||
@@ -1,50 +1,48 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "utils"
|
name = "utils"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
atty.workspace = true
|
sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||||
sentry.workspace = true
|
async-trait = "0.1"
|
||||||
async-trait.workspace = true
|
anyhow = "1.0"
|
||||||
anyhow.workspace = true
|
bincode = "1.3"
|
||||||
bincode.workspace = true
|
bytes = "1.0.1"
|
||||||
bytes.workspace = true
|
hyper = { version = "0.14.7", features = ["full"] }
|
||||||
hyper = { workspace = true, features = ["full"] }
|
routerify = "3"
|
||||||
routerify.workspace = true
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde.workspace = true
|
serde_json = "1"
|
||||||
serde_json.workspace = true
|
thiserror = "1.0"
|
||||||
thiserror.workspace = true
|
tokio = { version = "1.17", features = ["macros"]}
|
||||||
tokio.workspace = true
|
tokio-rustls = "0.23"
|
||||||
tokio-rustls.workspace = true
|
tracing = "0.1"
|
||||||
tracing.workspace = true
|
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
|
||||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
nix = "0.25"
|
||||||
nix.workspace = true
|
signal-hook = "0.3.10"
|
||||||
signal-hook.workspace = true
|
rand = "0.8.3"
|
||||||
rand.workspace = true
|
jsonwebtoken = "8"
|
||||||
jsonwebtoken.workspace = true
|
hex = { version = "0.4.3", features = ["serde"] }
|
||||||
hex = { workspace = true, features = ["serde"] }
|
rustls = "0.20.2"
|
||||||
rustls.workspace = true
|
rustls-split = "0.3.0"
|
||||||
rustls-split.workspace = true
|
git-version = "0.3.5"
|
||||||
git-version.workspace = true
|
serde_with = "2.0"
|
||||||
serde_with.workspace = true
|
once_cell = "1.13.0"
|
||||||
once_cell.workspace = true
|
strum = "0.24"
|
||||||
strum.workspace = true
|
strum_macros = "0.24"
|
||||||
strum_macros.workspace = true
|
|
||||||
|
|
||||||
metrics.workspace = true
|
metrics = { path = "../metrics" }
|
||||||
pq_proto.workspace = true
|
pq_proto = { path = "../pq_proto" }
|
||||||
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
byteorder.workspace = true
|
byteorder = "1.4.3"
|
||||||
bytes.workspace = true
|
bytes = "1.0.1"
|
||||||
hex-literal.workspace = true
|
hex-literal = "0.3"
|
||||||
tempfile.workspace = true
|
tempfile = "3.2"
|
||||||
criterion.workspace = true
|
criterion = "0.4"
|
||||||
rustls-pemfile.workspace = true
|
rustls-pemfile = "1"
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "benchmarks"
|
name = "benchmarks"
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
use hyper::{header, Body, Response, StatusCode};
|
use hyper::{header, Body, Response, StatusCode};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tracing::error;
|
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
pub enum ApiError {
|
pub enum ApiError {
|
||||||
@@ -77,16 +76,8 @@ impl HttpErrorBody {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub async fn handler(err: routerify::RouteError) -> Response<Body> {
|
pub async fn handler(err: routerify::RouteError) -> Response<Body> {
|
||||||
let api_error = err
|
tracing::error!("Error processing HTTP request: {:?}", err);
|
||||||
.downcast::<ApiError>()
|
err.downcast::<ApiError>()
|
||||||
.expect("handler should always return api error");
|
.expect("handler should always return api error")
|
||||||
|
.into_response()
|
||||||
// Print a stack trace for Internal Server errors
|
|
||||||
if let ApiError::InternalServerError(_) = api_error.as_ref() {
|
|
||||||
error!("Error processing HTTP request: {api_error:?}");
|
|
||||||
} else {
|
|
||||||
error!("Error processing HTTP request: {api_error:#}");
|
|
||||||
}
|
|
||||||
|
|
||||||
api_error.into_response()
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ use strum_macros::{EnumString, EnumVariantNames};
|
|||||||
pub enum LogFormat {
|
pub enum LogFormat {
|
||||||
Plain,
|
Plain,
|
||||||
Json,
|
Json,
|
||||||
Test,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LogFormat {
|
impl LogFormat {
|
||||||
@@ -34,13 +33,12 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
|||||||
let base_logger = tracing_subscriber::fmt()
|
let base_logger = tracing_subscriber::fmt()
|
||||||
.with_env_filter(env_filter)
|
.with_env_filter(env_filter)
|
||||||
.with_target(false)
|
.with_target(false)
|
||||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
.with_ansi(false)
|
||||||
.with_writer(std::io::stdout);
|
.with_writer(std::io::stdout);
|
||||||
|
|
||||||
match log_format {
|
match log_format {
|
||||||
LogFormat::Json => base_logger.json().init(),
|
LogFormat::Json => base_logger.json().init(),
|
||||||
LogFormat::Plain => base_logger.init(),
|
LogFormat::Plain => base_logger.init(),
|
||||||
LogFormat::Test => base_logger.with_test_writer().init(),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -7,12 +7,12 @@ use crate::postgres_backend::AuthType;
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use bytes::{Buf, Bytes, BytesMut};
|
use bytes::{Buf, Bytes, BytesMut};
|
||||||
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
|
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
|
||||||
|
use std::future::Future;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::task::Poll;
|
use std::task::Poll;
|
||||||
use std::{future::Future, task::ready};
|
|
||||||
use tracing::{debug, error, info, trace};
|
use tracing::{debug, error, info, trace};
|
||||||
|
|
||||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
|
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
|
||||||
@@ -253,9 +253,12 @@ impl PostgresBackend {
|
|||||||
cx: &mut std::task::Context<'_>,
|
cx: &mut std::task::Context<'_>,
|
||||||
) -> Poll<Result<(), std::io::Error>> {
|
) -> Poll<Result<(), std::io::Error>> {
|
||||||
while self.buf_out.has_remaining() {
|
while self.buf_out.has_remaining() {
|
||||||
match ready!(Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk())) {
|
match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
|
||||||
Ok(bytes_written) => self.buf_out.advance(bytes_written),
|
Poll::Ready(Ok(bytes_written)) => {
|
||||||
Err(err) => return Poll::Ready(Err(err)),
|
self.buf_out.advance(bytes_written);
|
||||||
|
}
|
||||||
|
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||||
|
Poll::Pending => return Poll::Pending,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Poll::Ready(Ok(()))
|
Poll::Ready(Ok(()))
|
||||||
@@ -570,9 +573,10 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
|||||||
// It's not strictly required to flush between each message, but makes it easier
|
// It's not strictly required to flush between each message, but makes it easier
|
||||||
// to view in wireshark, and usually the messages that the callers write are
|
// to view in wireshark, and usually the messages that the callers write are
|
||||||
// decently-sized anyway.
|
// decently-sized anyway.
|
||||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
match this.pgb.poll_write_buf(cx) {
|
||||||
Ok(()) => {}
|
Poll::Ready(Ok(())) => {}
|
||||||
Err(err) => return Poll::Ready(Err(err)),
|
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||||
|
Poll::Pending => return Poll::Pending,
|
||||||
}
|
}
|
||||||
|
|
||||||
// CopyData
|
// CopyData
|
||||||
@@ -589,9 +593,10 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
|||||||
cx: &mut std::task::Context<'_>,
|
cx: &mut std::task::Context<'_>,
|
||||||
) -> Poll<Result<(), std::io::Error>> {
|
) -> Poll<Result<(), std::io::Error>> {
|
||||||
let this = self.get_mut();
|
let this = self.get_mut();
|
||||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
match this.pgb.poll_write_buf(cx) {
|
||||||
Ok(()) => {}
|
Poll::Ready(Ok(())) => {}
|
||||||
Err(err) => return Poll::Ready(Err(err)),
|
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||||
|
Poll::Pending => return Poll::Pending,
|
||||||
}
|
}
|
||||||
this.pgb.poll_flush(cx)
|
this.pgb.poll_flush(cx)
|
||||||
}
|
}
|
||||||
@@ -600,9 +605,10 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
|||||||
cx: &mut std::task::Context<'_>,
|
cx: &mut std::task::Context<'_>,
|
||||||
) -> Poll<Result<(), std::io::Error>> {
|
) -> Poll<Result<(), std::io::Error>> {
|
||||||
let this = self.get_mut();
|
let this = self.get_mut();
|
||||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
match this.pgb.poll_write_buf(cx) {
|
||||||
Ok(()) => {}
|
Poll::Ready(Ok(())) => {}
|
||||||
Err(err) => return Poll::Ready(Err(err)),
|
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||||
|
Poll::Pending => return Poll::Pending,
|
||||||
}
|
}
|
||||||
this.pgb.poll_flush(cx)
|
this.pgb.poll_flush(cx)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "pageserver"
|
name = "pageserver"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition = "2021"
|
||||||
license.workspace = true
|
license = "Apache-2.0"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
@@ -11,67 +11,68 @@ default = []
|
|||||||
testing = ["fail/failpoints"]
|
testing = ["fail/failpoints"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||||
async-stream.workspace = true
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
async-trait.workspace = true
|
async-stream = "0.3"
|
||||||
byteorder.workspace = true
|
async-trait = "0.1"
|
||||||
bytes.workspace = true
|
byteorder = "1.4.3"
|
||||||
chrono = { workspace = true, features = ["serde"] }
|
bytes = "1.0.1"
|
||||||
clap = { workspace = true, features = ["string"] }
|
chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] }
|
||||||
close_fds.workspace = true
|
clap = { version = "4.0", features = ["string"] }
|
||||||
const_format.workspace = true
|
close_fds = "0.3.2"
|
||||||
consumption_metrics.workspace = true
|
const_format = "0.2.21"
|
||||||
crc32c.workspace = true
|
crc32c = "0.6.0"
|
||||||
crossbeam-utils.workspace = true
|
crossbeam-utils = "0.8.5"
|
||||||
fail.workspace = true
|
fail = "0.5.0"
|
||||||
futures.workspace = true
|
futures = "0.3.13"
|
||||||
git-version.workspace = true
|
git-version = "0.3.5"
|
||||||
hex.workspace = true
|
hex = "0.4.3"
|
||||||
humantime.workspace = true
|
humantime = "2.1.0"
|
||||||
humantime-serde.workspace = true
|
humantime-serde = "1.1.1"
|
||||||
hyper.workspace = true
|
hyper = "0.14"
|
||||||
itertools.workspace = true
|
itertools = "0.10.3"
|
||||||
nix.workspace = true
|
nix = "0.25"
|
||||||
num-traits.workspace = true
|
num-traits = "0.2.15"
|
||||||
once_cell.workspace = true
|
once_cell = "1.13.0"
|
||||||
pin-project-lite.workspace = true
|
pin-project-lite = "0.2.7"
|
||||||
postgres.workspace = true
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
postgres-protocol.workspace = true
|
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
postgres-types.workspace = true
|
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
rand.workspace = true
|
rand = "0.8.3"
|
||||||
regex.workspace = true
|
regex = "1.4.5"
|
||||||
scopeguard.workspace = true
|
rstar = "0.9.3"
|
||||||
serde.workspace = true
|
scopeguard = "1.1.0"
|
||||||
serde_json = { workspace = true, features = ["raw_value"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_with.workspace = true
|
serde_json = { version = "1.0", features = ["raw_value"] }
|
||||||
signal-hook.workspace = true
|
serde_with = "2.0"
|
||||||
svg_fmt.workspace = true
|
signal-hook = "0.3.10"
|
||||||
tokio-tar.workspace = true
|
svg_fmt = "0.4.1"
|
||||||
thiserror.workspace = true
|
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
||||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
thiserror = "1.0"
|
||||||
tokio-postgres.workspace = true
|
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||||
tokio-util.workspace = true
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||||
toml_edit.workspace = true
|
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||||
tracing.workspace = true
|
toml_edit = { version = "0.14", features = ["easy"] }
|
||||||
url.workspace = true
|
tracing = "0.1.36"
|
||||||
walkdir.workspace = true
|
url = "2"
|
||||||
metrics.workspace = true
|
walkdir = "2.3.2"
|
||||||
pageserver_api.workspace = true
|
|
||||||
postgres_connection.workspace = true
|
metrics = { path = "../libs/metrics" }
|
||||||
postgres_ffi.workspace = true
|
pageserver_api = { path = "../libs/pageserver_api" }
|
||||||
pq_proto.workspace = true
|
postgres_connection = { path = "../libs/postgres_connection" }
|
||||||
remote_storage.workspace = true
|
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||||
storage_broker.workspace = true
|
pq_proto = { path = "../libs/pq_proto" }
|
||||||
tenant_size_model.workspace = true
|
remote_storage = { path = "../libs/remote_storage" }
|
||||||
utils.workspace = true
|
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||||
workspace_hack.workspace = true
|
tenant_size_model = { path = "../libs/tenant_size_model" }
|
||||||
reqwest.workspace = true
|
utils = { path = "../libs/utils" }
|
||||||
rpds.workspace = true
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
|
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
criterion.workspace = true
|
criterion = "0.4"
|
||||||
hex-literal.workspace = true
|
hex-literal = "0.3"
|
||||||
tempfile.workspace = true
|
tempfile = "3.2"
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "bench_layer_map"
|
name = "bench_layer_map"
|
||||||
|
|||||||
@@ -1,12 +1,13 @@
|
|||||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
use anyhow::Result;
|
||||||
use pageserver::repository::Key;
|
use pageserver::repository::Key;
|
||||||
use pageserver::tenant::layer_map::LayerMap;
|
use pageserver::tenant::layer_map::LayerMap;
|
||||||
use pageserver::tenant::storage_layer::Layer;
|
use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
|
||||||
use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor};
|
use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
|
||||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||||
use std::cmp::{max, min};
|
use std::cmp::{max, min};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{BufRead, BufReader};
|
use std::io::{BufRead, BufReader};
|
||||||
|
use std::ops::Range;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@@ -16,35 +17,102 @@ use utils::lsn::Lsn;
|
|||||||
|
|
||||||
use criterion::{criterion_group, criterion_main, Criterion};
|
use criterion::{criterion_group, criterion_main, Criterion};
|
||||||
|
|
||||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
struct DummyDelta {
|
||||||
let mut layer_map = LayerMap::<LayerDescriptor>::default();
|
key_range: Range<Key>,
|
||||||
|
lsn_range: Range<Lsn>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Layer for DummyDelta {
|
||||||
|
fn get_key_range(&self) -> Range<Key> {
|
||||||
|
self.key_range.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||||
|
self.lsn_range.clone()
|
||||||
|
}
|
||||||
|
fn get_value_reconstruct_data(
|
||||||
|
&self,
|
||||||
|
_key: Key,
|
||||||
|
_lsn_range: Range<Lsn>,
|
||||||
|
_reconstruct_data: &mut ValueReconstructState,
|
||||||
|
) -> Result<ValueReconstructResult> {
|
||||||
|
panic!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_incremental(&self) -> bool {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||||
|
unimplemented!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn short_id(&self) -> String {
|
||||||
|
unimplemented!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DummyImage {
|
||||||
|
key_range: Range<Key>,
|
||||||
|
lsn: Lsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Layer for DummyImage {
|
||||||
|
fn get_key_range(&self) -> Range<Key> {
|
||||||
|
self.key_range.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||||
|
// End-bound is exclusive
|
||||||
|
self.lsn..(self.lsn + 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_value_reconstruct_data(
|
||||||
|
&self,
|
||||||
|
_key: Key,
|
||||||
|
_lsn_range: Range<Lsn>,
|
||||||
|
_reconstruct_data: &mut ValueReconstructState,
|
||||||
|
) -> Result<ValueReconstructResult> {
|
||||||
|
panic!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_incremental(&self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||||
|
unimplemented!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn short_id(&self) -> String {
|
||||||
|
unimplemented!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<dyn Layer> {
|
||||||
|
let mut layer_map = LayerMap::<dyn Layer>::default();
|
||||||
|
|
||||||
let mut min_lsn = Lsn(u64::MAX);
|
let mut min_lsn = Lsn(u64::MAX);
|
||||||
let mut max_lsn = Lsn(0);
|
let mut max_lsn = Lsn(0);
|
||||||
|
|
||||||
let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();
|
let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();
|
||||||
|
|
||||||
let mut updates = layer_map.batch_update();
|
|
||||||
for fname in filenames {
|
for fname in filenames {
|
||||||
let fname = &fname.unwrap();
|
let fname = &fname.unwrap();
|
||||||
if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||||
let layer = LayerDescriptor {
|
let layer = DummyImage {
|
||||||
key: imgfilename.key_range,
|
key_range: imgfilename.key_range,
|
||||||
lsn: imgfilename.lsn..(imgfilename.lsn + 1),
|
lsn: imgfilename.lsn,
|
||||||
is_incremental: false,
|
|
||||||
short_id: fname.to_string(),
|
|
||||||
};
|
};
|
||||||
updates.insert_historic(Arc::new(layer));
|
layer_map.insert_historic(Arc::new(layer));
|
||||||
min_lsn = min(min_lsn, imgfilename.lsn);
|
min_lsn = min(min_lsn, imgfilename.lsn);
|
||||||
max_lsn = max(max_lsn, imgfilename.lsn);
|
max_lsn = max(max_lsn, imgfilename.lsn);
|
||||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
|
} else if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
|
||||||
let layer = LayerDescriptor {
|
let layer = DummyDelta {
|
||||||
key: deltafilename.key_range.clone(),
|
key_range: deltafilename.key_range,
|
||||||
lsn: deltafilename.lsn_range.clone(),
|
lsn_range: deltafilename.lsn_range.clone(),
|
||||||
is_incremental: true,
|
|
||||||
short_id: fname.to_string(),
|
|
||||||
};
|
};
|
||||||
updates.insert_historic(Arc::new(layer));
|
layer_map.insert_historic(Arc::new(layer));
|
||||||
min_lsn = min(min_lsn, deltafilename.lsn_range.start);
|
min_lsn = min(min_lsn, deltafilename.lsn_range.start);
|
||||||
max_lsn = max(max_lsn, deltafilename.lsn_range.end);
|
max_lsn = max(max_lsn, deltafilename.lsn_range.end);
|
||||||
} else {
|
} else {
|
||||||
@@ -54,12 +122,11 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
|||||||
|
|
||||||
println!("min: {min_lsn}, max: {max_lsn}");
|
println!("min: {min_lsn}, max: {max_lsn}");
|
||||||
|
|
||||||
updates.flush();
|
|
||||||
layer_map
|
layer_map
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Construct a layer map query pattern for benchmarks
|
/// Construct a layer map query pattern for benchmarks
|
||||||
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
|
fn uniform_query_pattern(layer_map: &LayerMap<dyn Layer>) -> Vec<(Key, Lsn)> {
|
||||||
// For each image layer we query one of the pages contained, at LSN right
|
// For each image layer we query one of the pages contained, at LSN right
|
||||||
// before the image layer was created. This gives us a somewhat uniform
|
// before the image layer was created. This gives us a somewhat uniform
|
||||||
// coverage of both the lsn and key space because image layers have
|
// coverage of both the lsn and key space because image layers have
|
||||||
@@ -83,41 +150,6 @@ fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Construct a partitioning for testing get_difficulty map when we
|
|
||||||
// don't have an exact result of `collect_keyspace` to work with.
|
|
||||||
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
|
|
||||||
let mut parts = Vec::new();
|
|
||||||
|
|
||||||
// We add a partition boundary at the start of each image layer,
|
|
||||||
// no matter what lsn range it covers. This is just the easiest
|
|
||||||
// thing to do. A better thing to do would be to get a real
|
|
||||||
// partitioning from some database. Even better, remove the need
|
|
||||||
// for key partitions by deciding where to create image layers
|
|
||||||
// directly based on a coverage-based difficulty map.
|
|
||||||
let mut keys: Vec<_> = layer_map
|
|
||||||
.iter_historic_layers()
|
|
||||||
.filter_map(|l| {
|
|
||||||
if l.is_incremental() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
let kr = l.get_key_range();
|
|
||||||
Some(kr.start.next())
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
keys.sort();
|
|
||||||
|
|
||||||
let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
|
||||||
for key in keys {
|
|
||||||
parts.push(KeySpace {
|
|
||||||
ranges: vec![current_key..key],
|
|
||||||
});
|
|
||||||
current_key = key;
|
|
||||||
}
|
|
||||||
|
|
||||||
KeyPartitioning { parts }
|
|
||||||
}
|
|
||||||
|
|
||||||
// Benchmark using metadata extracted from our performance test environment, from
|
// Benchmark using metadata extracted from our performance test environment, from
|
||||||
// a project where we have run pgbench many times. The pgbench database was initialized
|
// a project where we have run pgbench many times. The pgbench database was initialized
|
||||||
// between each test run.
|
// between each test run.
|
||||||
@@ -151,68 +183,24 @@ fn bench_from_captest_env(c: &mut Criterion) {
|
|||||||
// Benchmark using metadata extracted from a real project that was taking
|
// Benchmark using metadata extracted from a real project that was taking
|
||||||
// too long processing layer map queries.
|
// too long processing layer map queries.
|
||||||
fn bench_from_real_project(c: &mut Criterion) {
|
fn bench_from_real_project(c: &mut Criterion) {
|
||||||
// Init layer map
|
// TODO consider compressing this file
|
||||||
let now = Instant::now();
|
|
||||||
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
|
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
|
||||||
println!("Finished layer map init in {:?}", now.elapsed());
|
|
||||||
|
|
||||||
// Choose uniformly distributed queries
|
|
||||||
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
|
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
|
||||||
|
|
||||||
// Choose inputs for get_difficulty_map
|
// Test with uniform query pattern
|
||||||
let latest_lsn = layer_map
|
c.bench_function("real_map_uniform_queries", |b| {
|
||||||
.iter_historic_layers()
|
|
||||||
.map(|l| l.get_lsn_range().end)
|
|
||||||
.max()
|
|
||||||
.unwrap();
|
|
||||||
let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
|
|
||||||
|
|
||||||
// Check correctness of get_difficulty_map
|
|
||||||
// TODO put this in a dedicated test outside of this mod
|
|
||||||
{
|
|
||||||
println!("running correctness check");
|
|
||||||
|
|
||||||
let now = Instant::now();
|
|
||||||
let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
|
|
||||||
assert!(result_bruteforce.len() == partitioning.parts.len());
|
|
||||||
println!("Finished bruteforce in {:?}", now.elapsed());
|
|
||||||
|
|
||||||
let now = Instant::now();
|
|
||||||
let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
|
|
||||||
assert!(result_fast.len() == partitioning.parts.len());
|
|
||||||
println!("Finished fast in {:?}", now.elapsed());
|
|
||||||
|
|
||||||
// Assert results are equal. Manually iterate for easier debugging.
|
|
||||||
let zip = std::iter::zip(
|
|
||||||
&partitioning.parts,
|
|
||||||
std::iter::zip(result_bruteforce, result_fast),
|
|
||||||
);
|
|
||||||
for (_part, (bruteforce, fast)) in zip {
|
|
||||||
assert_eq!(bruteforce, fast);
|
|
||||||
}
|
|
||||||
|
|
||||||
println!("No issues found");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Define and name the benchmark function
|
|
||||||
let mut group = c.benchmark_group("real_map");
|
|
||||||
group.bench_function("uniform_queries", |b| {
|
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
for q in queries.clone().into_iter() {
|
for q in queries.clone().into_iter() {
|
||||||
layer_map.search(q.0, q.1);
|
layer_map.search(q.0, q.1);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
group.bench_function("get_difficulty_map", |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
|
|
||||||
});
|
|
||||||
});
|
|
||||||
group.finish();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
|
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
|
||||||
fn bench_sequential(c: &mut Criterion) {
|
fn bench_sequential(c: &mut Criterion) {
|
||||||
|
let mut layer_map: LayerMap<dyn Layer> = LayerMap::default();
|
||||||
|
|
||||||
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
|
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
|
||||||
//
|
//
|
||||||
// TODO This code is pretty slow and runs even if we're only running other
|
// TODO This code is pretty slow and runs even if we're only running other
|
||||||
@@ -220,39 +208,39 @@ fn bench_sequential(c: &mut Criterion) {
|
|||||||
// Putting it inside the `bench_function` closure is not a solution
|
// Putting it inside the `bench_function` closure is not a solution
|
||||||
// because then it runs multiple times during warmup.
|
// because then it runs multiple times during warmup.
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
let mut layer_map = LayerMap::default();
|
|
||||||
let mut updates = layer_map.batch_update();
|
|
||||||
for i in 0..100_000 {
|
for i in 0..100_000 {
|
||||||
|
// TODO try inserting a super-wide layer in between every 10 to reflect
|
||||||
|
// what often happens with L1 layers that include non-rel changes.
|
||||||
|
// Maybe do that as a separate test.
|
||||||
let i32 = (i as u32) % 100;
|
let i32 = (i as u32) % 100;
|
||||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||||
let layer = LayerDescriptor {
|
let layer = DummyImage {
|
||||||
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||||
lsn: Lsn(i)..Lsn(i + 1),
|
lsn: Lsn(10 * i),
|
||||||
is_incremental: false,
|
|
||||||
short_id: format!("Layer {}", i),
|
|
||||||
};
|
};
|
||||||
updates.insert_historic(Arc::new(layer));
|
layer_map.insert_historic(Arc::new(layer));
|
||||||
}
|
}
|
||||||
updates.flush();
|
|
||||||
println!("Finished layer map init in {:?}", now.elapsed());
|
// Manually measure runtime without criterion because criterion
|
||||||
|
// has a minimum sample size of 10 and I don't want to run it 10 times.
|
||||||
|
println!("Finished init in {:?}", now.elapsed());
|
||||||
|
|
||||||
// Choose 100 uniformly random queries
|
// Choose 100 uniformly random queries
|
||||||
let rng = &mut StdRng::seed_from_u64(1);
|
let rng = &mut StdRng::seed_from_u64(1);
|
||||||
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
|
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
|
||||||
.choose_multiple(rng, 100)
|
.choose_multiple(rng, 1)
|
||||||
.copied()
|
.copied()
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
// Define and name the benchmark function
|
// Define and name the benchmark function
|
||||||
let mut group = c.benchmark_group("sequential");
|
c.bench_function("sequential_uniform_queries", |b| {
|
||||||
group.bench_function("uniform_queries", |b| {
|
// Run the search queries
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
for q in queries.clone().into_iter() {
|
for q in queries.clone().into_iter() {
|
||||||
layer_map.search(q.0, q.1);
|
layer_map.search(q.0, q.1);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
group.finish();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group!(group_1, bench_from_captest_env);
|
criterion_group!(group_1, bench_from_captest_env);
|
||||||
|
|||||||
@@ -30,44 +30,33 @@ fn redo_scenarios(c: &mut Criterion) {
|
|||||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||||
let conf = Box::leak(Box::new(conf));
|
let conf = Box::leak(Box::new(conf));
|
||||||
let tenant_id = TenantId::generate();
|
let tenant_id = TenantId::generate();
|
||||||
|
// std::fs::create_dir_all(conf.tenant_path(&tenant_id)).unwrap();
|
||||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
let mut manager = PostgresRedoManager::new(conf, tenant_id);
|
||||||
|
manager.launch_process(14).unwrap();
|
||||||
|
|
||||||
let manager = Arc::new(manager);
|
let manager = Arc::new(manager);
|
||||||
|
|
||||||
tracing::info!("executing first");
|
|
||||||
short().execute(&manager).unwrap();
|
|
||||||
tracing::info!("first executed");
|
|
||||||
|
|
||||||
let thread_counts = [1, 2, 4, 8, 16];
|
let thread_counts = [1, 2, 4, 8, 16];
|
||||||
|
|
||||||
let mut group = c.benchmark_group("short");
|
|
||||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
|
||||||
|
|
||||||
for thread_count in thread_counts {
|
for thread_count in thread_counts {
|
||||||
group.bench_with_input(
|
c.bench_with_input(
|
||||||
BenchmarkId::new("short", thread_count),
|
BenchmarkId::new("short-50record", thread_count),
|
||||||
&thread_count,
|
&thread_count,
|
||||||
|b, thread_count| {
|
|b, thread_count| {
|
||||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
|
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short, 50);
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
drop(group);
|
|
||||||
|
|
||||||
let mut group = c.benchmark_group("medium");
|
|
||||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
|
||||||
|
|
||||||
for thread_count in thread_counts {
|
for thread_count in thread_counts {
|
||||||
group.bench_with_input(
|
c.bench_with_input(
|
||||||
BenchmarkId::new("medium", thread_count),
|
BenchmarkId::new("medium-10record", thread_count),
|
||||||
&thread_count,
|
&thread_count,
|
||||||
|b, thread_count| {
|
|b, thread_count| {
|
||||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
|
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium, 10);
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
drop(group);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
|
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
|
||||||
@@ -76,66 +65,46 @@ fn add_multithreaded_walredo_requesters(
|
|||||||
threads: u32,
|
threads: u32,
|
||||||
manager: &Arc<PostgresRedoManager>,
|
manager: &Arc<PostgresRedoManager>,
|
||||||
input_factory: fn() -> Request,
|
input_factory: fn() -> Request,
|
||||||
|
request_repeats: usize,
|
||||||
) {
|
) {
|
||||||
assert_ne!(threads, 0);
|
b.iter_batched_ref(
|
||||||
|
|| {
|
||||||
|
// barrier for all of the threads, and the benchmarked thread
|
||||||
|
let barrier = Arc::new(Barrier::new(threads as usize + 1));
|
||||||
|
|
||||||
if threads == 1 {
|
let jhs = (0..threads)
|
||||||
b.iter_batched_ref(
|
.map(|_| {
|
||||||
|| Some(input_factory()),
|
std::thread::spawn({
|
||||||
|input| execute_all(input.take(), manager),
|
let manager = manager.clone();
|
||||||
criterion::BatchSize::PerIteration,
|
let barrier = barrier.clone();
|
||||||
);
|
move || {
|
||||||
} else {
|
let input = std::iter::repeat(input_factory())
|
||||||
let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
|
.take(request_repeats)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
|
barrier.wait();
|
||||||
|
|
||||||
let barrier = Arc::new(Barrier::new(threads as usize + 1));
|
execute_all(input, &manager).unwrap();
|
||||||
|
|
||||||
let jhs = (0..threads)
|
barrier.wait();
|
||||||
.map(|_| {
|
|
||||||
std::thread::spawn({
|
|
||||||
let manager = manager.clone();
|
|
||||||
let barrier = barrier.clone();
|
|
||||||
let work_rx = work_rx.clone();
|
|
||||||
move || loop {
|
|
||||||
// queue up and wait if we want to go another round
|
|
||||||
if work_rx.lock().unwrap().recv().is_err() {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
})
|
||||||
let input = Some(input_factory());
|
|
||||||
|
|
||||||
barrier.wait();
|
|
||||||
|
|
||||||
execute_all(input, &manager).unwrap();
|
|
||||||
|
|
||||||
barrier.wait();
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
})
|
.collect::<Vec<_>>();
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
let _jhs = JoinOnDrop(jhs);
|
(barrier, JoinOnDrop(jhs))
|
||||||
|
},
|
||||||
|
|input| {
|
||||||
|
let barrier = &input.0;
|
||||||
|
|
||||||
b.iter_batched(
|
// start the work
|
||||||
|| {
|
barrier.wait();
|
||||||
for _ in 0..threads {
|
|
||||||
work_tx.send(()).unwrap()
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|()| {
|
|
||||||
// start the work
|
|
||||||
barrier.wait();
|
|
||||||
|
|
||||||
// wait for work to complete
|
// wait for work to complete
|
||||||
barrier.wait();
|
barrier.wait();
|
||||||
},
|
},
|
||||||
criterion::BatchSize::PerIteration,
|
criterion::BatchSize::PerIteration,
|
||||||
);
|
);
|
||||||
|
|
||||||
drop(work_tx);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
|
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
|
||||||
@@ -152,10 +121,7 @@ impl Drop for JoinOnDrop {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError>
|
fn execute_all(input: Vec<Request>, manager: &PostgresRedoManager) -> Result<(), WalRedoError> {
|
||||||
where
|
|
||||||
I: IntoIterator<Item = Request>,
|
|
||||||
{
|
|
||||||
// just fire all requests as fast as possible
|
// just fire all requests as fast as possible
|
||||||
input.into_iter().try_for_each(|req| {
|
input.into_iter().try_for_each(|req| {
|
||||||
let page = req.execute(manager)?;
|
let page = req.execute(manager)?;
|
||||||
@@ -177,7 +143,6 @@ macro_rules! lsn {
|
|||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Short payload, 1132 bytes.
|
|
||||||
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
||||||
// for null bytes.
|
// for null bytes.
|
||||||
#[allow(clippy::octal_escapes)]
|
#[allow(clippy::octal_escapes)]
|
||||||
@@ -207,7 +172,6 @@ fn short() -> Request {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Medium sized payload, serializes as 26393 bytes.
|
|
||||||
// see [`short`]
|
// see [`short`]
|
||||||
#[allow(clippy::octal_escapes)]
|
#[allow(clippy::octal_escapes)]
|
||||||
fn medium() -> Request {
|
fn medium() -> Request {
|
||||||
|
|||||||
Binary file not shown.
@@ -10,7 +10,7 @@
|
|||||||
//! This module is responsible for creation of such tarball
|
//! This module is responsible for creation of such tarball
|
||||||
//! from data stored in object storage.
|
//! from data stored in object storage.
|
||||||
//!
|
//!
|
||||||
use anyhow::{anyhow, bail, ensure, Context};
|
use anyhow::{anyhow, ensure, Context, Result};
|
||||||
use bytes::{BufMut, BytesMut};
|
use bytes::{BufMut, BytesMut};
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
use std::fmt::Write as FmtWrite;
|
use std::fmt::Write as FmtWrite;
|
||||||
@@ -27,8 +27,8 @@ use tracing::*;
|
|||||||
///
|
///
|
||||||
use tokio_tar::{Builder, EntryType, Header};
|
use tokio_tar::{Builder, EntryType, Header};
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::tenant::TimelineRequestContext;
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::{PageReconstructError, Timeline};
|
||||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||||
|
|
||||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||||
@@ -53,8 +53,8 @@ pub async fn send_basebackup_tarball<'a, W>(
|
|||||||
req_lsn: Option<Lsn>,
|
req_lsn: Option<Lsn>,
|
||||||
prev_lsn: Option<Lsn>,
|
prev_lsn: Option<Lsn>,
|
||||||
full_backup: bool,
|
full_backup: bool,
|
||||||
ctx: &'a RequestContext,
|
ctx: &'a TimelineRequestContext,
|
||||||
) -> anyhow::Result<()>
|
) -> Result<(), PageReconstructError>
|
||||||
where
|
where
|
||||||
W: AsyncWrite + Send + Sync + Unpin,
|
W: AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
@@ -93,8 +93,10 @@ where
|
|||||||
|
|
||||||
// Consolidate the derived and the provided prev_lsn values
|
// Consolidate the derived and the provided prev_lsn values
|
||||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||||
if backup_prev != Lsn(0) {
|
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
|
||||||
ensure!(backup_prev == provided_prev_lsn);
|
return Err(PageReconstructError::Other(anyhow!(
|
||||||
|
"prev LSN doesn't match"
|
||||||
|
)));
|
||||||
}
|
}
|
||||||
provided_prev_lsn
|
provided_prev_lsn
|
||||||
} else {
|
} else {
|
||||||
@@ -132,14 +134,14 @@ where
|
|||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
prev_record_lsn: Lsn,
|
prev_record_lsn: Lsn,
|
||||||
full_backup: bool,
|
full_backup: bool,
|
||||||
ctx: &'a RequestContext,
|
ctx: &'a TimelineRequestContext,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, W> Basebackup<'a, W>
|
impl<'a, W> Basebackup<'a, W>
|
||||||
where
|
where
|
||||||
W: AsyncWrite + Send + Sync + Unpin,
|
W: AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
async fn send_tarball(mut self) -> anyhow::Result<()> {
|
async fn send_tarball(mut self) -> Result<(), PageReconstructError> {
|
||||||
// TODO include checksum
|
// TODO include checksum
|
||||||
|
|
||||||
// Create pgdata subdirs structure
|
// Create pgdata subdirs structure
|
||||||
@@ -210,17 +212,19 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
fail_point!("basebackup-before-control-file", |_| {
|
fail_point!("basebackup-before-control-file", |_| {
|
||||||
bail!("failpoint basebackup-before-control-file")
|
Err(PageReconstructError::from(anyhow!(
|
||||||
|
"failpoint basebackup-before-control-file"
|
||||||
|
)))
|
||||||
});
|
});
|
||||||
|
|
||||||
// Generate pg_control and bootstrap WAL segment.
|
// Generate pg_control and bootstrap WAL segment.
|
||||||
self.add_pgcontrol_file().await?;
|
self.add_pgcontrol_file().await?;
|
||||||
self.ar.finish().await?;
|
self.ar.finish().await.context("could not finish tarball")?;
|
||||||
debug!("all tarred up!");
|
debug!("all tarred up!");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
async fn add_rel(&mut self, tag: RelTag) -> Result<(), PageReconstructError> {
|
||||||
let nblocks = self
|
let nblocks = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_rel_size(tag, self.lsn, false, self.ctx)
|
.get_rel_size(tag, self.lsn, false, self.ctx)
|
||||||
@@ -230,7 +234,10 @@ where
|
|||||||
if nblocks == 0 {
|
if nblocks == 0 {
|
||||||
let file_name = tag.to_segfile_name(0);
|
let file_name = tag.to_segfile_name(0);
|
||||||
let header = new_tar_header(&file_name, 0)?;
|
let header = new_tar_header(&file_name, 0)?;
|
||||||
self.ar.append(&header, &mut io::empty()).await?;
|
self.ar
|
||||||
|
.append(&header, &mut io::empty())
|
||||||
|
.await
|
||||||
|
.context("could not write empty relfile to tar stream")?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -239,7 +246,6 @@ where
|
|||||||
let mut seg = 0;
|
let mut seg = 0;
|
||||||
while startblk < nblocks {
|
while startblk < nblocks {
|
||||||
let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
|
let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
|
||||||
|
|
||||||
let mut segment_data: Vec<u8> = vec![];
|
let mut segment_data: Vec<u8> = vec![];
|
||||||
for blknum in startblk..endblk {
|
for blknum in startblk..endblk {
|
||||||
let img = self
|
let img = self
|
||||||
@@ -251,7 +257,10 @@ where
|
|||||||
|
|
||||||
let file_name = tag.to_segfile_name(seg as u32);
|
let file_name = tag.to_segfile_name(seg as u32);
|
||||||
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
||||||
self.ar.append(&header, segment_data.as_slice()).await?;
|
self.ar
|
||||||
|
.append(&header, segment_data.as_slice())
|
||||||
|
.await
|
||||||
|
.context("could not write relfile segment to tar stream")?;
|
||||||
|
|
||||||
seg += 1;
|
seg += 1;
|
||||||
startblk = endblk;
|
startblk = endblk;
|
||||||
|
|||||||
@@ -13,9 +13,8 @@ use tracing::*;
|
|||||||
use metrics::set_build_info_metric;
|
use metrics::set_build_info_metric;
|
||||||
use pageserver::{
|
use pageserver::{
|
||||||
config::{defaults::*, PageServerConf},
|
config::{defaults::*, PageServerConf},
|
||||||
context::{DownloadBehavior, RequestContext},
|
context::{DownloadBehavior, RequestContext, TaskKind},
|
||||||
http, page_cache, page_service, task_mgr,
|
http, page_cache, page_service, task_mgr,
|
||||||
task_mgr::TaskKind,
|
|
||||||
task_mgr::{
|
task_mgr::{
|
||||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||||
},
|
},
|
||||||
@@ -27,7 +26,7 @@ use utils::{
|
|||||||
logging,
|
logging,
|
||||||
postgres_backend::AuthType,
|
postgres_backend::AuthType,
|
||||||
project_git_version,
|
project_git_version,
|
||||||
sentry_init::init_sentry,
|
sentry_init::{init_sentry, release_name},
|
||||||
signals::{self, Signal},
|
signals::{self, Signal},
|
||||||
tcp_listener,
|
tcp_listener,
|
||||||
};
|
};
|
||||||
@@ -86,10 +85,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// initialize sentry if SENTRY_DSN is provided
|
// initialize sentry if SENTRY_DSN is provided
|
||||||
let _sentry_guard = init_sentry(
|
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]);
|
||||||
Some(GIT_VERSION.into()),
|
|
||||||
&[("node_id", &conf.id.to_string())],
|
|
||||||
);
|
|
||||||
|
|
||||||
let tenants_path = conf.tenants_path();
|
let tenants_path = conf.tenants_path();
|
||||||
if !tenants_path.exists() {
|
if !tenants_path.exists() {
|
||||||
@@ -307,62 +303,52 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
|||||||
{
|
{
|
||||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||||
|
|
||||||
|
let mgmt_ctx = RequestContext::new(TaskKind::HttpEndpointListener, DownloadBehavior::Error);
|
||||||
|
let cancellation_token = Box::leak(Box::new(mgmt_ctx.cancellation_token().clone()));
|
||||||
let router = http::make_router(conf, auth.clone(), remote_storage)?
|
let router = http::make_router(conf, auth.clone(), remote_storage)?
|
||||||
.build()
|
.build()
|
||||||
.map_err(|err| anyhow!(err))?;
|
.map_err(|err| anyhow!(err))?;
|
||||||
let service = utils::http::RouterService::new(router).unwrap();
|
let service = utils::http::RouterService::new(router).unwrap();
|
||||||
let server = hyper::Server::from_tcp(http_listener)?
|
let server = hyper::Server::from_tcp(http_listener)?
|
||||||
.serve(service)
|
.serve(service)
|
||||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
.with_graceful_shutdown(cancellation_token.cancelled());
|
||||||
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
MGMT_REQUEST_RUNTIME.handle(),
|
MGMT_REQUEST_RUNTIME.handle(),
|
||||||
TaskKind::HttpEndpointListener,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
"http endpoint listener",
|
"http endpoint listener",
|
||||||
true,
|
true,
|
||||||
async {
|
async {
|
||||||
server.await?;
|
match server.await {
|
||||||
Ok(())
|
Ok(()) => info!("HTTP endpoint listener shut down"),
|
||||||
|
Err(err) => error!("HTTP endpoint listener shut down with error: {err:?}"),
|
||||||
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||||
let metrics_ctx = RequestContext::todo_child(
|
let metrics_ctx = RequestContext::new(
|
||||||
TaskKind::MetricsCollection,
|
TaskKind::MetricsCollection,
|
||||||
// This task itself shouldn't download anything.
|
DownloadBehavior::Error, // metrics collector shouldn't be downloading anything
|
||||||
// The actual size calculation does need downloads, and
|
);
|
||||||
// creates a child context with the right DownloadBehavior.
|
task_mgr::spawn(
|
||||||
DownloadBehavior::Error,
|
MGMT_REQUEST_RUNTIME.handle(),
|
||||||
);
|
"consumption metrics collection",
|
||||||
task_mgr::spawn(
|
true,
|
||||||
MGMT_REQUEST_RUNTIME.handle(),
|
pageserver::consumption_metrics::collect_metrics(
|
||||||
TaskKind::MetricsCollection,
|
metric_collection_endpoint,
|
||||||
None,
|
conf.metric_collection_interval,
|
||||||
None,
|
conf.id,
|
||||||
"consumption metrics collection",
|
metrics_ctx,
|
||||||
true,
|
)
|
||||||
async move {
|
.instrument(info_span!("metrics_collection")),
|
||||||
pageserver::consumption_metrics::collect_metrics(
|
);
|
||||||
metric_collection_endpoint,
|
|
||||||
conf.metric_collection_interval,
|
|
||||||
conf.synthetic_size_calculation_interval,
|
|
||||||
conf.id,
|
|
||||||
metrics_ctx,
|
|
||||||
)
|
|
||||||
.instrument(info_span!("metrics_collection"))
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||||
// for each connection. We created the listener earlier already.
|
// for each connection. We created the listener earlier already.
|
||||||
{
|
{
|
||||||
let libpq_ctx = RequestContext::todo_child(
|
let libpq_ctx = RequestContext::new(
|
||||||
TaskKind::LibpqEndpointListener,
|
TaskKind::LibpqEndpointListener,
|
||||||
// listener task shouldn't need to download anything. (We will
|
// listener task shouldn't need to download anything. (We will
|
||||||
// create separate sub-contexts for each connection, with their
|
// create separate sub-contexts for each connection, with their
|
||||||
@@ -372,13 +358,10 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
|||||||
);
|
);
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
COMPUTE_REQUEST_RUNTIME.handle(),
|
COMPUTE_REQUEST_RUNTIME.handle(),
|
||||||
TaskKind::LibpqEndpointListener,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
"libpq endpoint listener",
|
"libpq endpoint listener",
|
||||||
true,
|
true,
|
||||||
async move {
|
async move {
|
||||||
page_service::libpq_listener_main(
|
match page_service::libpq_listener_main(
|
||||||
conf,
|
conf,
|
||||||
auth,
|
auth,
|
||||||
pageserver_listener,
|
pageserver_listener,
|
||||||
@@ -386,6 +369,10 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
|||||||
libpq_ctx,
|
libpq_ctx,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
{
|
||||||
|
Ok(()) => info!("libpq endpoint listener shut down"),
|
||||||
|
Err(err) => error!("libpq endpoint listener shut down with error: {err:?}"),
|
||||||
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -405,7 +392,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
|||||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||||
signal.name()
|
signal.name()
|
||||||
);
|
);
|
||||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
|
BACKGROUND_RUNTIME.block_on(task_mgr::shutdown_pageserver(0));
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -59,8 +59,6 @@ pub mod defaults {
|
|||||||
|
|
||||||
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
||||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Default built-in configuration file.
|
/// Default built-in configuration file.
|
||||||
///
|
///
|
||||||
@@ -85,7 +83,6 @@ pub mod defaults {
|
|||||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||||
|
|
||||||
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
|
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
|
||||||
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
|
|
||||||
|
|
||||||
# [tenant_config]
|
# [tenant_config]
|
||||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||||
@@ -155,11 +152,8 @@ pub struct PageServerConf {
|
|||||||
// How often to collect metrics and send them to the metrics endpoint.
|
// How often to collect metrics and send them to the metrics endpoint.
|
||||||
pub metric_collection_interval: Duration,
|
pub metric_collection_interval: Duration,
|
||||||
pub metric_collection_endpoint: Option<Url>,
|
pub metric_collection_endpoint: Option<Url>,
|
||||||
pub synthetic_size_calculation_interval: Duration,
|
|
||||||
|
|
||||||
pub test_remote_failures: u64,
|
pub test_remote_failures: u64,
|
||||||
|
|
||||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||||
@@ -221,11 +215,8 @@ struct PageServerConfigBuilder {
|
|||||||
|
|
||||||
metric_collection_interval: BuilderValue<Duration>,
|
metric_collection_interval: BuilderValue<Duration>,
|
||||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
|
||||||
|
|
||||||
test_remote_failures: BuilderValue<u64>,
|
test_remote_failures: BuilderValue<u64>,
|
||||||
|
|
||||||
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for PageServerConfigBuilder {
|
impl Default for PageServerConfigBuilder {
|
||||||
@@ -264,15 +255,9 @@ impl Default for PageServerConfigBuilder {
|
|||||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||||
)
|
)
|
||||||
.expect("cannot parse default metric collection interval")),
|
.expect("cannot parse default metric collection interval")),
|
||||||
synthetic_size_calculation_interval: Set(humantime::parse_duration(
|
|
||||||
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
|
|
||||||
)
|
|
||||||
.expect("cannot parse default synthetic size calculation interval")),
|
|
||||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||||
|
|
||||||
test_remote_failures: Set(0),
|
test_remote_failures: Set(0),
|
||||||
|
|
||||||
ondemand_download_behavior_treat_error_as_warn: Set(false),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -357,26 +342,10 @@ impl PageServerConfigBuilder {
|
|||||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn synthetic_size_calculation_interval(
|
|
||||||
&mut self,
|
|
||||||
synthetic_size_calculation_interval: Duration,
|
|
||||||
) {
|
|
||||||
self.synthetic_size_calculation_interval =
|
|
||||||
BuilderValue::Set(synthetic_size_calculation_interval)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn test_remote_failures(&mut self, fail_first: u64) {
|
pub fn test_remote_failures(&mut self, fail_first: u64) {
|
||||||
self.test_remote_failures = BuilderValue::Set(fail_first);
|
self.test_remote_failures = BuilderValue::Set(fail_first);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn ondemand_download_behavior_treat_error_as_warn(
|
|
||||||
&mut self,
|
|
||||||
ondemand_download_behavior_treat_error_as_warn: bool,
|
|
||||||
) {
|
|
||||||
self.ondemand_download_behavior_treat_error_as_warn =
|
|
||||||
BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||||
Ok(PageServerConf {
|
Ok(PageServerConf {
|
||||||
listen_pg_addr: self
|
listen_pg_addr: self
|
||||||
@@ -430,17 +399,9 @@ impl PageServerConfigBuilder {
|
|||||||
metric_collection_endpoint: self
|
metric_collection_endpoint: self
|
||||||
.metric_collection_endpoint
|
.metric_collection_endpoint
|
||||||
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
||||||
synthetic_size_calculation_interval: self
|
|
||||||
.synthetic_size_calculation_interval
|
|
||||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
|
||||||
test_remote_failures: self
|
test_remote_failures: self
|
||||||
.test_remote_failures
|
.test_remote_failures
|
||||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||||
ondemand_download_behavior_treat_error_as_warn: self
|
|
||||||
.ondemand_download_behavior_treat_error_as_warn
|
|
||||||
.ok_or(anyhow!(
|
|
||||||
"missing ondemand_download_behavior_treat_error_as_warn"
|
|
||||||
))?,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -616,10 +577,8 @@ impl PageServerConf {
|
|||||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||||
builder.metric_collection_endpoint(Some(endpoint));
|
builder.metric_collection_endpoint(Some(endpoint));
|
||||||
},
|
},
|
||||||
"synthetic_size_calculation_interval" =>
|
|
||||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
|
||||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||||
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
|
|
||||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -742,9 +701,7 @@ impl PageServerConf {
|
|||||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||||
metric_collection_interval: Duration::from_secs(60),
|
metric_collection_interval: Duration::from_secs(60),
|
||||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
|
||||||
test_remote_failures: 0,
|
test_remote_failures: 0,
|
||||||
ondemand_download_behavior_treat_error_as_warn: false,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -770,11 +727,6 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
|
|||||||
Ok(i as u64)
|
Ok(i as u64)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
|
|
||||||
item.as_bool()
|
|
||||||
.with_context(|| format!("configure option {name} is not a bool"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
|
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
|
||||||
let s = item
|
let s = item
|
||||||
.as_str()
|
.as_str()
|
||||||
@@ -882,7 +834,6 @@ id = 10
|
|||||||
|
|
||||||
metric_collection_interval = '222 s'
|
metric_collection_interval = '222 s'
|
||||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||||
synthetic_size_calculation_interval = '333 s'
|
|
||||||
log_format = 'json'
|
log_format = 'json'
|
||||||
|
|
||||||
"#;
|
"#;
|
||||||
@@ -929,11 +880,7 @@ log_format = 'json'
|
|||||||
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
||||||
)?,
|
)?,
|
||||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
|
||||||
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
|
||||||
)?,
|
|
||||||
test_remote_failures: 0,
|
test_remote_failures: 0,
|
||||||
ondemand_download_behavior_treat_error_as_warn: false,
|
|
||||||
},
|
},
|
||||||
"Correct defaults should be used when no config values are provided"
|
"Correct defaults should be used when no config values are provided"
|
||||||
);
|
);
|
||||||
@@ -979,9 +926,7 @@ log_format = 'json'
|
|||||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||||
metric_collection_interval: Duration::from_secs(222),
|
metric_collection_interval: Duration::from_secs(222),
|
||||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
|
||||||
test_remote_failures: 0,
|
test_remote_failures: 0,
|
||||||
ondemand_download_behavior_treat_error_as_warn: false,
|
|
||||||
},
|
},
|
||||||
"Should be able to parse all basic config values correctly"
|
"Should be able to parse all basic config values correctly"
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -3,88 +3,164 @@
|
|||||||
//! and push them to a HTTP endpoint.
|
//! and push them to a HTTP endpoint.
|
||||||
//! Cache metrics to send only the updated ones.
|
//! Cache metrics to send only the updated ones.
|
||||||
//!
|
//!
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
|
||||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
|
||||||
use crate::tenant::mgr;
|
|
||||||
use anyhow;
|
use anyhow;
|
||||||
use chrono::Utc;
|
use tracing::*;
|
||||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
use utils::id::NodeId;
|
||||||
use pageserver_api::models::TenantState;
|
use utils::id::TimelineId;
|
||||||
use reqwest::Url;
|
|
||||||
use serde::Serialize;
|
use crate::context::RequestContext;
|
||||||
|
use crate::tenant::mgr;
|
||||||
|
use utils::id::TenantId;
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_with::{serde_as, DisplayFromStr};
|
use serde_with::{serde_as, DisplayFromStr};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use std::fmt;
|
||||||
|
use std::str::FromStr;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tracing::*;
|
|
||||||
use utils::id::{NodeId, TenantId, TimelineId};
|
|
||||||
|
|
||||||
const WRITTEN_SIZE: &str = "written_size";
|
use chrono::{DateTime, Utc};
|
||||||
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
|
use rand::Rng;
|
||||||
const RESIDENT_SIZE: &str = "resident_size";
|
use reqwest::Url;
|
||||||
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
|
||||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
|
||||||
|
|
||||||
|
/// ConsumptionMetric struct that defines the format for one metric entry
|
||||||
|
/// i.e.
|
||||||
|
///
|
||||||
|
/// ```json
|
||||||
|
/// {
|
||||||
|
/// "metric": "remote_storage_size",
|
||||||
|
/// "type": "absolute",
|
||||||
|
/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||||
|
/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
||||||
|
/// "time": "2022-12-28T11:07:19.317310284Z",
|
||||||
|
/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||||
|
/// "value": 12345454,
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
#[serde_as]
|
#[serde_as]
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||||
struct Ids {
|
pub struct ConsumptionMetric {
|
||||||
|
pub metric: ConsumptionMetricKind,
|
||||||
|
#[serde(rename = "type")]
|
||||||
|
pub metric_type: &'static str,
|
||||||
#[serde_as(as = "DisplayFromStr")]
|
#[serde_as(as = "DisplayFromStr")]
|
||||||
tenant_id: TenantId,
|
pub tenant_id: TenantId,
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
timeline_id: Option<TimelineId>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Key that uniquely identifies the object, this metric describes.
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
|
||||||
pub struct PageserverConsumptionMetricsKey {
|
|
||||||
pub tenant_id: TenantId,
|
|
||||||
pub timeline_id: Option<TimelineId>,
|
pub timeline_id: Option<TimelineId>,
|
||||||
pub metric: &'static str,
|
pub time: DateTime<Utc>,
|
||||||
|
pub idempotency_key: String,
|
||||||
|
pub value: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Main thread that serves metrics collection
|
impl ConsumptionMetric {
|
||||||
|
pub fn new_absolute<R: Rng + ?Sized>(
|
||||||
|
metric: ConsumptionMetricKind,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
timeline_id: Option<TimelineId>,
|
||||||
|
value: u64,
|
||||||
|
node_id: NodeId,
|
||||||
|
rng: &mut R,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
metric,
|
||||||
|
metric_type: "absolute",
|
||||||
|
tenant_id,
|
||||||
|
timeline_id,
|
||||||
|
time: Utc::now(),
|
||||||
|
// key that allows metric collector to distinguish unique events
|
||||||
|
idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
|
||||||
|
value,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum ConsumptionMetricKind {
|
||||||
|
/// Amount of WAL produced by a timeline, i.e. last_record_lsn
|
||||||
|
/// This is an absolute, per-timeline metric.
|
||||||
|
WrittenSize,
|
||||||
|
/// Size of all tenant branches including WAL
|
||||||
|
/// This is an absolute, per-tenant metric.
|
||||||
|
/// This is the same metric that tenant/tenant_id/size endpoint returns.
|
||||||
|
SyntheticStorageSize,
|
||||||
|
/// Size of all the layer files in the tenant's directory on disk on the pageserver.
|
||||||
|
/// This is an absolute, per-tenant metric.
|
||||||
|
/// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
|
||||||
|
ResidentSize,
|
||||||
|
/// Size of the remote storage (S3) directory.
|
||||||
|
/// This is an absolute, per-tenant metric.
|
||||||
|
RemoteStorageSize,
|
||||||
|
/// Logical size of the data in the timeline
|
||||||
|
/// This is an absolute, per-timeline metric
|
||||||
|
TimelineLogicalSize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for ConsumptionMetricKind {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
|
"written_size" => Ok(Self::WrittenSize),
|
||||||
|
"synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
|
||||||
|
"resident_size" => Ok(Self::ResidentSize),
|
||||||
|
"remote_storage_size" => Ok(Self::RemoteStorageSize),
|
||||||
|
"timeline_logical_size" => Ok(Self::TimelineLogicalSize),
|
||||||
|
_ => anyhow::bail!("invalid value \"{s}\" for metric type"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for ConsumptionMetricKind {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
f.write_str(match self {
|
||||||
|
ConsumptionMetricKind::WrittenSize => "written_size",
|
||||||
|
ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
|
||||||
|
ConsumptionMetricKind::ResidentSize => "resident_size",
|
||||||
|
ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
|
||||||
|
ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct ConsumptionMetricsKey {
|
||||||
|
tenant_id: TenantId,
|
||||||
|
timeline_id: Option<TimelineId>,
|
||||||
|
metric: ConsumptionMetricKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(serde::Serialize)]
|
||||||
|
struct EventChunk<'a> {
|
||||||
|
events: &'a [ConsumptionMetric],
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Main task that serves metrics collection
|
||||||
pub async fn collect_metrics(
|
pub async fn collect_metrics(
|
||||||
metric_collection_endpoint: &Url,
|
metric_collection_endpoint: &Url,
|
||||||
metric_collection_interval: Duration,
|
metric_collection_interval: Duration,
|
||||||
synthetic_size_calculation_interval: Duration,
|
|
||||||
node_id: NodeId,
|
node_id: NodeId,
|
||||||
ctx: RequestContext,
|
metrics_ctx: RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) {
|
||||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||||
|
|
||||||
info!("starting collect_metrics");
|
info!("starting collect_metrics");
|
||||||
|
|
||||||
// spin up background worker that calculates tenant sizes
|
|
||||||
let worker_ctx =
|
|
||||||
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
|
|
||||||
task_mgr::spawn(
|
|
||||||
BACKGROUND_RUNTIME.handle(),
|
|
||||||
TaskKind::CalculateSyntheticSize,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
"synthetic size calculation",
|
|
||||||
false,
|
|
||||||
async move {
|
|
||||||
calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
|
|
||||||
.instrument(info_span!("synthetic_size_worker"))
|
|
||||||
.await?;
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
// define client here to reuse it for all requests
|
// define client here to reuse it for all requests
|
||||||
let client = reqwest::Client::new();
|
let client = reqwest::Client::new();
|
||||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = metrics_ctx.cancelled() => {
|
||||||
info!("collect_metrics received cancellation request");
|
info!("collect_metrics received cancellation request");
|
||||||
return Ok(());
|
return;
|
||||||
},
|
},
|
||||||
_ = ticker.tick() => {
|
_ = ticker.tick() => {
|
||||||
if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await
|
if let Err(err) = collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &metrics_ctx).await {
|
||||||
{
|
// Log the error and continue
|
||||||
error!("metrics collection failed: {err:?}");
|
error!("metrics collection failed: {err:?}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -96,60 +172,65 @@ pub async fn collect_metrics(
|
|||||||
///
|
///
|
||||||
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
|
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
|
||||||
/// Cache metrics to avoid sending the same metrics multiple times.
|
/// Cache metrics to avoid sending the same metrics multiple times.
|
||||||
///
|
pub async fn collect_metrics_task(
|
||||||
/// TODO
|
|
||||||
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
|
|
||||||
/// - improve error handling. Now if one tenant fails to collect metrics,
|
|
||||||
/// the whole iteration fails and metrics for other tenants are not collected.
|
|
||||||
pub async fn collect_metrics_iteration(
|
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
|
cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
|
||||||
metric_collection_endpoint: &reqwest::Url,
|
metric_collection_endpoint: &reqwest::Url,
|
||||||
node_id: NodeId,
|
node_id: NodeId,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
|
let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
|
||||||
trace!(
|
trace!(
|
||||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
"starting collect_metrics_task. metric_collection_endpoint: {}",
|
||||||
metric_collection_endpoint
|
metric_collection_endpoint
|
||||||
);
|
);
|
||||||
|
|
||||||
// get list of tenants
|
// get list of tenants
|
||||||
let tenants = mgr::list_tenants().await?;
|
let tenants = mgr::list_tenants().await;
|
||||||
|
|
||||||
// iterate through list of Active tenants and collect metrics
|
// iterate through list of Active tenants and collect metrics
|
||||||
for (tenant_id, tenant_state) in tenants {
|
for (tenant_id, tenant_state) in tenants {
|
||||||
if tenant_state != TenantState::Active {
|
if ctx.is_cancelled() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
let tenant = mgr::get_tenant(tenant_id).await?;
|
||||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
// If the tenant was shut down while we were looking elsewhere, skip it.
|
||||||
|
let tenant_ctx = match tenant.get_context(ctx) {
|
||||||
|
Ok(ctx) => ctx,
|
||||||
|
Err(_state) => {
|
||||||
|
debug!(
|
||||||
|
"skipping metrics collection for tenant {tenant_id} because it is not active"
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let mut tenant_resident_size = 0;
|
let mut tenant_resident_size = 0;
|
||||||
|
|
||||||
// iterate through list of timelines in tenant
|
// iterate through list of timelines in tenant
|
||||||
for timeline in tenant.list_timelines().iter() {
|
for timeline in tenant.list_timelines().iter() {
|
||||||
// collect per-timeline metrics only for active timelines
|
// collect per-timeline metrics only for active timelines
|
||||||
if timeline.is_active() {
|
if let Ok(timeline_ctx) = timeline.get_context(&tenant_ctx) {
|
||||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||||
|
|
||||||
current_metrics.push((
|
current_metrics.push((
|
||||||
PageserverConsumptionMetricsKey {
|
ConsumptionMetricsKey {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
timeline_id: Some(timeline.timeline_id),
|
timeline_id: Some(timeline.timeline_id),
|
||||||
metric: WRITTEN_SIZE,
|
metric: ConsumptionMetricKind::WrittenSize,
|
||||||
},
|
},
|
||||||
timeline_written_size,
|
timeline_written_size,
|
||||||
));
|
));
|
||||||
|
|
||||||
let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?;
|
let (timeline_logical_size, is_exact) =
|
||||||
|
timeline.get_current_logical_size(&timeline_ctx)?;
|
||||||
// Only send timeline logical size when it is fully calculated.
|
// Only send timeline logical size when it is fully calculated.
|
||||||
if is_exact {
|
if is_exact {
|
||||||
current_metrics.push((
|
current_metrics.push((
|
||||||
PageserverConsumptionMetricsKey {
|
ConsumptionMetricsKey {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
timeline_id: Some(timeline.timeline_id),
|
timeline_id: Some(timeline.timeline_id),
|
||||||
metric: TIMELINE_LOGICAL_SIZE,
|
metric: ConsumptionMetricKind::TimelineLogicalSize,
|
||||||
},
|
},
|
||||||
timeline_logical_size,
|
timeline_logical_size,
|
||||||
));
|
));
|
||||||
@@ -167,34 +248,24 @@ pub async fn collect_metrics_iteration(
|
|||||||
);
|
);
|
||||||
|
|
||||||
current_metrics.push((
|
current_metrics.push((
|
||||||
PageserverConsumptionMetricsKey {
|
ConsumptionMetricsKey {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
timeline_id: None,
|
timeline_id: None,
|
||||||
metric: RESIDENT_SIZE,
|
metric: ConsumptionMetricKind::ResidentSize,
|
||||||
},
|
},
|
||||||
tenant_resident_size,
|
tenant_resident_size,
|
||||||
));
|
));
|
||||||
|
|
||||||
current_metrics.push((
|
current_metrics.push((
|
||||||
PageserverConsumptionMetricsKey {
|
ConsumptionMetricsKey {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
timeline_id: None,
|
timeline_id: None,
|
||||||
metric: REMOTE_STORAGE_SIZE,
|
metric: ConsumptionMetricKind::RemoteStorageSize,
|
||||||
},
|
},
|
||||||
tenant_remote_size,
|
tenant_remote_size,
|
||||||
));
|
));
|
||||||
|
|
||||||
// Note that this metric is calculated in a separate bgworker
|
// TODO add SyntheticStorageSize metric
|
||||||
// Here we only use cached value, which may lag behind the real latest one
|
|
||||||
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
|
|
||||||
current_metrics.push((
|
|
||||||
PageserverConsumptionMetricsKey {
|
|
||||||
tenant_id,
|
|
||||||
timeline_id: None,
|
|
||||||
metric: SYNTHETIC_STORAGE_SIZE,
|
|
||||||
},
|
|
||||||
tenant_synthetic_size,
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Filter metrics
|
// Filter metrics
|
||||||
@@ -210,29 +281,35 @@ pub async fn collect_metrics_iteration(
|
|||||||
|
|
||||||
// Send metrics.
|
// Send metrics.
|
||||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||||
|
const CHUNK_SIZE: usize = 1000;
|
||||||
let chunks = current_metrics.chunks(CHUNK_SIZE);
|
let chunks = current_metrics.chunks(CHUNK_SIZE);
|
||||||
|
|
||||||
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
|
let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
|
||||||
|
|
||||||
for chunk in chunks {
|
for chunk in chunks {
|
||||||
chunk_to_send.clear();
|
chunk_to_send.clear();
|
||||||
|
|
||||||
// enrich metrics with type,timestamp and idempotency key before sending
|
// this code block is needed to convince compiler
|
||||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
|
// that rng is not reused around an await point
|
||||||
kind: EventType::Absolute { time: Utc::now() },
|
{
|
||||||
metric: curr_key.metric,
|
// enrich metrics with timestamp and metric_kind before sending
|
||||||
idempotency_key: idempotency_key(node_id.to_string()),
|
let mut rng = rand::thread_rng();
|
||||||
value: *curr_val,
|
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
|
||||||
extra: Ids {
|
ConsumptionMetric::new_absolute(
|
||||||
tenant_id: curr_key.tenant_id,
|
curr_key.metric,
|
||||||
timeline_id: curr_key.timeline_id,
|
curr_key.tenant_id,
|
||||||
},
|
curr_key.timeline_id,
|
||||||
}));
|
*curr_val,
|
||||||
|
node_id,
|
||||||
|
&mut rng,
|
||||||
|
)
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
||||||
events: &chunk_to_send,
|
events: &chunk_to_send,
|
||||||
})
|
})
|
||||||
.expect("PageserverConsumptionMetric should not fail serialization");
|
.expect("ConsumptionMetric should not fail serialization");
|
||||||
|
|
||||||
let res = client
|
let res = client
|
||||||
.post(metric_collection_endpoint.clone())
|
.post(metric_collection_endpoint.clone())
|
||||||
@@ -259,46 +336,3 @@ pub async fn collect_metrics_iteration(
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate synthetic size for each active tenant
|
|
||||||
pub async fn calculate_synthetic_size_worker(
|
|
||||||
synthetic_size_calculation_interval: Duration,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
info!("starting calculate_synthetic_size_worker");
|
|
||||||
|
|
||||||
let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
|
|
||||||
|
|
||||||
loop {
|
|
||||||
tokio::select! {
|
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
|
||||||
return Ok(());
|
|
||||||
},
|
|
||||||
_ = ticker.tick() => {
|
|
||||||
|
|
||||||
let tenants = match mgr::list_tenants().await {
|
|
||||||
Ok(tenants) => tenants,
|
|
||||||
Err(e) => {
|
|
||||||
warn!("cannot get tenant list: {e:#}");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// iterate through list of Active tenants and collect metrics
|
|
||||||
for (tenant_id, tenant_state) in tenants {
|
|
||||||
|
|
||||||
if tenant_state != TenantState::Active {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
|
|
||||||
{
|
|
||||||
if let Err(e) = tenant.calculate_synthetic_size(ctx).await {
|
|
||||||
error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,100 +1,100 @@
|
|||||||
//! This module defines `RequestContext`, a structure that we use throughout
|
|
||||||
//! the pageserver to propagate high-level context from places
|
|
||||||
//! that _originate_ activity down to the shared code paths at the
|
|
||||||
//! heart of the pageserver. It's inspired by Golang's `context.Context`.
|
|
||||||
//!
|
//!
|
||||||
//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
|
//! Most async functions throughout the pageserver take a `ctx: &RequestContext`
|
||||||
//! 1. What high-level activity ([`TaskKind`]) needs this page?
|
//! argument. It is used to control desired behaviour of the operation, and to
|
||||||
//! We need that information as a categorical dimension for page access
|
//! allow cancelling the operation gracefully.
|
||||||
//! statistics, which we, in turn, need to guide layer eviction policy design.
|
|
||||||
//! 2. How should we behave if, to produce the page image, we need to
|
|
||||||
//! on-demand download a layer file ([`DownloadBehavior`]).
|
|
||||||
//!
|
//!
|
||||||
//! [`RequestContext`] satisfies those needs.
|
//! # Context hierarchy
|
||||||
//! The current implementation is a small `struct` that is passed through
|
|
||||||
//! the call chain by reference.
|
|
||||||
//!
|
//!
|
||||||
//! ### Future Work
|
//! RequestContexts form a hierarchy. For example:
|
||||||
//!
|
//!
|
||||||
//! However, we do not intend to stop here, since there are other needs that
|
//! listener context (LibpqEndpointListener)
|
||||||
//! require carrying information from high to low levels of the app.
|
//! connection context (PageRequestHandler)
|
||||||
|
//! per-request context (PageRequestHandler)
|
||||||
//!
|
//!
|
||||||
//! Most importantly, **cancellation signaling** in response to
|
//! The top "listener context" is created at pageserver startup. The tokio
|
||||||
//! 1. timeouts (page_service max response time) and
|
//! task that listens on the libpq protocol TCP port holds that context. When
|
||||||
//! 2. lifecycle requests (detach tenant, delete timeline).
|
//! it accepts a connection, it spawns a new task to handle that connection
|
||||||
|
//! and creates a new per-connection context for it. The mgmt API listener,
|
||||||
|
//! background jobs, and other things form separate but similar hierarchies.
|
||||||
//!
|
//!
|
||||||
//! Related to that, there is sometimes a need to ensure that all tokio tasks spawned
|
//! Usually, each tokio task has its own context, but it's not a strict
|
||||||
//! by the transitive callees of a request have finished. The keyword here
|
//! requirement and some tasks can hold multiple contexts, and conversely,
|
||||||
//! is **Structured Concurrency**, and right now, we use `task_mgr` in most places,
|
//! some contexts are shared by multiple tasks that work together to perform
|
||||||
//! `TaskHandle` in some places, and careful code review around `FuturesUnordered`
|
//! some operation.
|
||||||
//! or `JoinSet` in other places.
|
|
||||||
//!
|
//!
|
||||||
//! We do not yet have a systematic cancellation story in pageserver, and it is
|
//! The hierarchy is not explicitly tracked in the RequestContext struct
|
||||||
//! pretty clear that [`RequestContext`] will be responsible for that.
|
//! itself, but only by their cancellation tokens. It's entirely possible for
|
||||||
//! So, the API already prepares for this role through the
|
//! the parent context to be dropped before its children.
|
||||||
//! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods.
|
|
||||||
//! See their doc comments for details on how we will use them in the future.
|
|
||||||
//!
|
//!
|
||||||
//! It is not clear whether or how we will enforce Structured Concurrency, and
|
//! # Tenant and Timeline registration
|
||||||
//! what role [`RequestContext`] will play there.
|
|
||||||
//! So, the API doesn't prepare us for this topic.
|
|
||||||
//!
|
//!
|
||||||
//! Other future uses of `RequestContext`:
|
//! Most operations are performed on a particular Tenant or Timeline. When
|
||||||
//! - Communicate compute & IO priorities (user-initiated request vs. background-loop)
|
//! operating on a Tenant or Timeline, it's important that the Tenant/Timeline
|
||||||
//! - Request IDs for distributed tracing
|
//! isn't detached or deleted while there are tasks working on it. To ensure
|
||||||
//! - Request/Timeline/Tenant-scoped log levels
|
//! that, a RequestContext can be registered with a Tenant or Timeline. See
|
||||||
|
//! `Tenant::register_context` and `Timeline::register_context`. When
|
||||||
|
//! shutting down a Tenant or Timeline, the shutdown routine cancels all the
|
||||||
|
//! registered contexts, and waits for them to be dropped before completing
|
||||||
|
//! the shutdown.
|
||||||
//!
|
//!
|
||||||
//! RequestContext might look quite different once it supports those features.
|
//! To enforce that you hold a registered context when operating on a Tenant
|
||||||
//! Likely, it will have a shape similar to Golang's `context.Context`.
|
//! or Timeline, most functions take a TimelineRequestContext or
|
||||||
|
//! TenantRequestContext reference as argument.
|
||||||
//!
|
//!
|
||||||
//! ### Why A Struct Instead Of Method Parameters
|
//! NOTE: The Tenant / Timeline registration is separate from the context
|
||||||
|
//! hierarchy. You can create a new RequestContext with TimelineRequestContext
|
||||||
|
//! as the parent, and register it with a different timeline, for example.
|
||||||
//!
|
//!
|
||||||
//! What's typical about such information is that it needs to be passed down
|
//! # Notes
|
||||||
//! along the call chain from high level to low level, but few of the functions
|
|
||||||
//! in the middle need to understand it.
|
|
||||||
//! Further, it is to be expected that we will need to propagate more data
|
|
||||||
//! in the future (see the earlier section on future work).
|
|
||||||
//! Hence, for functions in the middle of the call chain, we have the following
|
|
||||||
//! requirements:
|
|
||||||
//! 1. It should be easy to forward the context to callees.
|
|
||||||
//! 2. To propagate more data from high-level to low-level code, the functions in
|
|
||||||
//! the middle should not need to be modified.
|
|
||||||
//! The solution is to have a container structure ([`RequestContext`]) that
|
|
||||||
//! carries the information. Functions that don't care about what's in it
|
|
||||||
//! pass it along to callees.
|
|
||||||
//!
|
//!
|
||||||
//! ### Why Not Task-Local Variables
|
//! All RequestContexts in the system have a unique ID, and are also tracked
|
||||||
|
//! in a global hash table, CONTEXTS.
|
||||||
//!
|
//!
|
||||||
//! One could use task-local variables (the equivalent of thread-local variables)
|
//! - Futures are normally not assumed to be async cancellation-safe. Pass a
|
||||||
//! to address the immediate needs outlined above.
|
//! RequestContext as argument and use cancel() on it instead.
|
||||||
//! However, we reject task-local variables because:
|
|
||||||
//! 1. they are implicit, thereby making it harder to trace the data flow in code
|
|
||||||
//! reviews and during debugging,
|
|
||||||
//! 2. they can be mutable, which enables implicit return data flow,
|
|
||||||
//! 3. they are restrictive in that code which fans out into multiple tasks,
|
|
||||||
//! or even threads, needs to carefully propagate the state.
|
|
||||||
//!
|
//!
|
||||||
//! In contrast, information flow with [`RequestContext`] is
|
//! - If you perform an operation that depends on some external actor or the
|
||||||
//! 1. always explicit,
|
//! network, use the cancellation token to check for cancellation
|
||||||
//! 2. strictly uni-directional because RequestContext is immutable,
|
//!
|
||||||
//! 3. tangible because a [`RequestContext`] is just a value.
|
//! - By convention, the appropriate context for the current operation is carried in
|
||||||
//! When creating child activities, regardless of whether it's a task,
|
//! a variable called 'ctx'. If a function handles multiple contexts, it's
|
||||||
//! thread, or even an RPC to another service, the value can
|
//! best to *not* have a variable called 'ctx', to force you to think which
|
||||||
//! be used like any other argument.
|
//! one to use in each call.
|
||||||
|
//!
|
||||||
|
//! # TODO
|
||||||
|
//! - include a unique request ID for tracing
|
||||||
//!
|
//!
|
||||||
//! The solution is that all code paths are infected with precisely one
|
|
||||||
//! [`RequestContext`] argument. Functions in the middle of the call chain
|
|
||||||
//! only need to pass it on.
|
|
||||||
use crate::task_mgr::TaskKind;
|
|
||||||
|
|
||||||
// The main structure of this module, see module-level comment.
|
use once_cell::sync::Lazy;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
use tracing::{info, warn};
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
/// Each RequestContext has a unique context ID. It's just an increasing
|
||||||
|
/// number that we assign.
|
||||||
|
static NEXT_CONTEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||||
|
|
||||||
|
/// Global registry of contexts
|
||||||
|
static CONTEXTS: Lazy<Mutex<HashMap<RequestContextId, (TaskKind, CancellationToken)>>> =
|
||||||
|
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
|
||||||
|
pub struct RequestContextId(u64);
|
||||||
|
|
||||||
|
///
|
||||||
pub struct RequestContext {
|
pub struct RequestContext {
|
||||||
|
context_id: RequestContextId,
|
||||||
task_kind: TaskKind,
|
task_kind: TaskKind,
|
||||||
|
|
||||||
download_behavior: DownloadBehavior,
|
download_behavior: DownloadBehavior,
|
||||||
|
cancellation_token: CancellationToken,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Desired behavior if the operation requires an on-demand download
|
/// DownloadBehavior option specifies the behavior if completing the operation
|
||||||
/// to proceed.
|
/// would require downloading a layer file from remote storage.
|
||||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||||
pub enum DownloadBehavior {
|
pub enum DownloadBehavior {
|
||||||
/// Download the layer file. It can take a while.
|
/// Download the layer file. It can take a while.
|
||||||
@@ -108,87 +108,132 @@ pub enum DownloadBehavior {
|
|||||||
Error,
|
Error,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// There are many kinds of tasks in the system. Some are associated with a particular
|
||||||
|
/// tenant or timeline, while others are global.
|
||||||
|
///
|
||||||
|
/// The task kind affects the shutdown sequence on pageserver shutdown and on detach
|
||||||
|
/// of an individual tenant. For example, when shutting down the pageserver, we shut
|
||||||
|
/// down the LibpqEndpointListeners first, so that we don't accept any more client
|
||||||
|
/// connections while we perform the rest of the shutdown duties. See
|
||||||
|
/// [`Timeline::graceful_shutdown`] and [`tenant_mgr::shutdown_pageserver`]
|
||||||
|
/// for details.
|
||||||
|
///
|
||||||
|
/// Note that we don't try to limit how many tasks of a certain kind can be running
|
||||||
|
/// at the same time.
|
||||||
|
///
|
||||||
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
|
pub enum TaskKind {
|
||||||
|
// libpq listener task. It just accepts connections and spawns a
|
||||||
|
// PageRequestHandler task for each connection.
|
||||||
|
LibpqEndpointListener,
|
||||||
|
|
||||||
|
// HTTP endpoint listener.
|
||||||
|
HttpEndpointListener,
|
||||||
|
|
||||||
|
// Task that handles a single connection. A PageRequestHandler task
|
||||||
|
// starts detached from any particular tenant or timeline, but it can be
|
||||||
|
// associated with one later, after receiving a command from the client.
|
||||||
|
PageRequestHandler,
|
||||||
|
|
||||||
|
// Context for one management API request
|
||||||
|
MgmtRequest,
|
||||||
|
|
||||||
|
// Manages the WAL receiver connection for one timeline. It subscribes to
|
||||||
|
// events from storage_broker, decides which safekeeper to connect to. It spawns a
|
||||||
|
// separate WalReceiverConnection task to handle each connection.
|
||||||
|
WalReceiverManager,
|
||||||
|
|
||||||
|
// Handles a connection to a safekeeper, to stream WAL to a timeline.
|
||||||
|
WalReceiverConnection,
|
||||||
|
|
||||||
|
// Garbage collection worker. One per tenant
|
||||||
|
GarbageCollector,
|
||||||
|
|
||||||
|
// Compaction. One per tenant.
|
||||||
|
Compaction,
|
||||||
|
|
||||||
|
// Initial logical size calculation
|
||||||
|
InitialLogicalSizeCalculation,
|
||||||
|
|
||||||
|
// Task that flushes frozen in-memory layers to disk
|
||||||
|
LayerFlush,
|
||||||
|
|
||||||
|
// Task that uploads a file to remote storage
|
||||||
|
RemoteUploadTask,
|
||||||
|
|
||||||
|
// Task that downloads a file from remote storage
|
||||||
|
RemoteDownloadTask,
|
||||||
|
|
||||||
|
// task that handles the initial downloading of all tenants
|
||||||
|
InitialLoad,
|
||||||
|
|
||||||
|
// task that handles attaching a tenant
|
||||||
|
Attach,
|
||||||
|
|
||||||
|
// task that handles metrics collection
|
||||||
|
MetricsCollection,
|
||||||
|
|
||||||
|
// task that drives downloading layers
|
||||||
|
DownloadAllRemoteLayers,
|
||||||
|
|
||||||
|
// Only used in unit tests
|
||||||
|
UnitTest,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for RequestContext {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
CONTEXTS
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.remove(&self.context_id)
|
||||||
|
.expect("context is not in global registry");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl RequestContext {
|
impl RequestContext {
|
||||||
/// Create a new RequestContext that has no parent.
|
/// Create a new RequestContext
|
||||||
///
|
|
||||||
/// The function is called `new` because, once we add children
|
|
||||||
/// to it using `detached_child` or `attached_child`, the contexts
|
|
||||||
/// form a tree (not implemented yet since cancellation will be
|
|
||||||
/// the first feature that requires a tree).
|
|
||||||
///
|
|
||||||
/// # Future: Cancellation
|
|
||||||
///
|
|
||||||
/// The only reason why a context like this one can be canceled is
|
|
||||||
/// because someone explicitly canceled it.
|
|
||||||
/// It has no parent, so it cannot inherit cancellation from there.
|
|
||||||
pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||||
|
let cancellation_token = CancellationToken::new();
|
||||||
|
let context_id = RequestContextId(NEXT_CONTEXT_ID.fetch_add(1, Ordering::Relaxed));
|
||||||
|
CONTEXTS
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.insert(context_id, (task_kind, cancellation_token.clone()));
|
||||||
|
|
||||||
RequestContext {
|
RequestContext {
|
||||||
task_kind,
|
task_kind,
|
||||||
|
context_id,
|
||||||
download_behavior,
|
download_behavior,
|
||||||
|
cancellation_token,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a detached child context for a task that may outlive `self`.
|
/// Create a new RequestContext, as a child of 'parent'.
|
||||||
///
|
pub fn with_parent(
|
||||||
/// Use this when spawning new background activity that should complete
|
task_kind: TaskKind,
|
||||||
/// even if the current request is canceled.
|
download_behavior: DownloadBehavior,
|
||||||
///
|
parent: &RequestContext,
|
||||||
/// # Future: Cancellation
|
) -> Self {
|
||||||
///
|
let cancellation_token = parent.cancellation_token.child_token();
|
||||||
/// Cancellation of `self` will not propagate to the child context returned
|
let context_id = RequestContextId(NEXT_CONTEXT_ID.fetch_add(1, Ordering::Relaxed));
|
||||||
/// by this method.
|
CONTEXTS
|
||||||
///
|
.lock()
|
||||||
/// # Future: Structured Concurrency
|
.unwrap()
|
||||||
///
|
.insert(context_id, (task_kind, cancellation_token.clone()));
|
||||||
/// We could add the Future as a parameter to this function, spawn it as a task,
|
|
||||||
/// and pass to the new task the child context as an argument.
|
|
||||||
/// That would be an ergonomic improvement.
|
|
||||||
///
|
|
||||||
/// We could make new calls to this function fail if `self` is already canceled.
|
|
||||||
pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
|
||||||
self.child_impl(task_kind, download_behavior)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a child of context `self` for a task that shall not outlive `self`.
|
|
||||||
///
|
|
||||||
/// Use this when fanning-out work to other async tasks.
|
|
||||||
///
|
|
||||||
/// # Future: Cancellation
|
|
||||||
///
|
|
||||||
/// Cancelling a context will propagate to its attached children.
|
|
||||||
///
|
|
||||||
/// # Future: Structured Concurrency
|
|
||||||
///
|
|
||||||
/// We could add the Future as a parameter to this function, spawn it as a task,
|
|
||||||
/// and track its `JoinHandle` inside the `RequestContext`.
|
|
||||||
///
|
|
||||||
/// We could then provide another method to allow waiting for all child tasks
|
|
||||||
/// to finish.
|
|
||||||
///
|
|
||||||
/// We could make new calls to this function fail if `self` is already canceled.
|
|
||||||
/// Alternatively, we could allow the creation but not spawn the task.
|
|
||||||
/// The method to wait for child tasks would return an error, indicating
|
|
||||||
/// that the child task was not started because the context was canceled.
|
|
||||||
pub fn attached_child(&self) -> Self {
|
|
||||||
self.child_impl(self.task_kind(), self.download_behavior())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Use this function when you should be creating a child context using
|
|
||||||
/// [`attached_child`] or [`detached_child`], but your caller doesn't provide
|
|
||||||
/// a context and you are unwilling to change all callers to provide one.
|
|
||||||
///
|
|
||||||
/// Before we add cancellation, we should get rid of this method.
|
|
||||||
pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
|
||||||
Self::new(task_kind, download_behavior)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
|
||||||
RequestContext {
|
RequestContext {
|
||||||
task_kind,
|
task_kind,
|
||||||
|
context_id,
|
||||||
download_behavior,
|
download_behavior,
|
||||||
|
cancellation_token,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn context_id(&self) -> RequestContextId {
|
||||||
|
self.context_id
|
||||||
|
}
|
||||||
|
|
||||||
pub fn task_kind(&self) -> TaskKind {
|
pub fn task_kind(&self) -> TaskKind {
|
||||||
self.task_kind
|
self.task_kind
|
||||||
}
|
}
|
||||||
@@ -196,4 +241,108 @@ impl RequestContext {
|
|||||||
pub fn download_behavior(&self) -> DownloadBehavior {
|
pub fn download_behavior(&self) -> DownloadBehavior {
|
||||||
self.download_behavior
|
self.download_behavior
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn cancellation_token(&self) -> &CancellationToken {
|
||||||
|
&self.cancellation_token
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_cancelled(&self) -> bool {
|
||||||
|
self.cancellation_token.is_cancelled()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn cancelled(&self) {
|
||||||
|
self.cancellation_token.cancelled().await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Cancel all the contexts in 'context_ids' and wait for them to finish.
|
||||||
|
///
|
||||||
|
/// Whenever we notice that one of the contexts has finished, it is removed
|
||||||
|
/// from 'context_ids'. On return, it is empty.
|
||||||
|
///
|
||||||
|
pub async fn cancel_and_wait(context_ids: &mut Vec<RequestContextId>) {
|
||||||
|
{
|
||||||
|
let contexts = CONTEXTS.lock().unwrap();
|
||||||
|
context_ids.retain(|context_id| {
|
||||||
|
if let Some((task_kind, cancellation_token)) = contexts.get(context_id) {
|
||||||
|
info!("cancelling task {task_kind:?} with ID {context_id:?}");
|
||||||
|
cancellation_token.cancel();
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
// Already gone
|
||||||
|
false
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
wait_contexts_to_finish(context_ids).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn wait_contexts_to_finish(context_ids: &mut Vec<RequestContextId>) {
|
||||||
|
let mut n = 0;
|
||||||
|
while !context_ids.is_empty() {
|
||||||
|
{
|
||||||
|
let contexts = CONTEXTS.lock().unwrap();
|
||||||
|
while let Some(context_id) = context_ids.last() {
|
||||||
|
if let Some((task_kind, _cancellation_token)) = contexts.get(context_id) {
|
||||||
|
info!("waiting for task {task_kind:?} with ID {context_id:?} to finish");
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
context_ids.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !context_ids.is_empty() {
|
||||||
|
crate::exponential_backoff(
|
||||||
|
n,
|
||||||
|
crate::DEFAULT_BASE_BACKOFF_SECONDS,
|
||||||
|
crate::DEFAULT_MAX_BACKOFF_SECONDS,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
n += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cancel and wait for all tasks of given 'kind' to finish
|
||||||
|
pub async fn shutdown_tasks(kind: TaskKind) {
|
||||||
|
let mut context_ids = Vec::new();
|
||||||
|
{
|
||||||
|
let contexts = CONTEXTS.lock().unwrap();
|
||||||
|
for (&context_id, (task_kind, cancellation_token)) in contexts.iter() {
|
||||||
|
if *task_kind == kind {
|
||||||
|
cancellation_token.cancel();
|
||||||
|
context_ids.push(context_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
wait_contexts_to_finish(&mut context_ids).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cancel all remaining contexts.
|
||||||
|
///
|
||||||
|
/// This is used as part of pageserver shutdown. We have already shut down all
|
||||||
|
/// tasks / contexts, this is just a backstop or sanity check to make sure we
|
||||||
|
/// didn't miss anything. Hence, also print a warning for any remaining tasks.
|
||||||
|
pub async fn shutdown_all_tasks() {
|
||||||
|
loop {
|
||||||
|
let mut context_ids = Vec::new();
|
||||||
|
{
|
||||||
|
let contexts = CONTEXTS.lock().unwrap();
|
||||||
|
|
||||||
|
if contexts.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (&context_id, (task_kind, cancellation_token)) in contexts.iter() {
|
||||||
|
cancellation_token.cancel();
|
||||||
|
context_ids.push(context_id);
|
||||||
|
warn!(
|
||||||
|
"unexpected task of kind {:?} with ID {:?} still running",
|
||||||
|
*task_kind, context_id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
wait_contexts_to_finish(&mut context_ids).await
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -430,13 +430,6 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
format: hex
|
format: hex
|
||||||
- name: inputs_only
|
|
||||||
in: query
|
|
||||||
required: false
|
|
||||||
schema:
|
|
||||||
type: boolean
|
|
||||||
description: |
|
|
||||||
When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
|
|
||||||
get:
|
get:
|
||||||
description: |
|
description: |
|
||||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||||
@@ -456,9 +449,8 @@ paths:
|
|||||||
format: hex
|
format: hex
|
||||||
size:
|
size:
|
||||||
type: integer
|
type: integer
|
||||||
nullable: true
|
|
||||||
description: |
|
description: |
|
||||||
Size metric in bytes or null if inputs_only=true was given.
|
Size metric in bytes.
|
||||||
"401":
|
"401":
|
||||||
description: Unauthorized Error
|
description: Unauthorized Error
|
||||||
content:
|
content:
|
||||||
|
|||||||
@@ -3,21 +3,17 @@ use std::sync::Arc;
|
|||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use hyper::StatusCode;
|
use hyper::StatusCode;
|
||||||
use hyper::{Body, Request, Response, Uri};
|
use hyper::{Body, Request, Response, Uri};
|
||||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use super::models::{
|
use super::models::{
|
||||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||||
TimelineCreateRequest, TimelineInfo,
|
TimelineCreateRequest, TimelineInfo,
|
||||||
};
|
};
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||||
use crate::task_mgr::TaskKind;
|
|
||||||
use crate::tenant::config::TenantConfOpt;
|
use crate::tenant::config::TenantConfOpt;
|
||||||
use crate::tenant::mgr::TenantMapInsertError;
|
use crate::tenant::{PageReconstructError, Timeline, TimelineRequestContext};
|
||||||
use crate::tenant::{PageReconstructError, Timeline};
|
|
||||||
use crate::{config::PageServerConf, tenant::mgr};
|
use crate::{config::PageServerConf, tenant::mgr};
|
||||||
use utils::{
|
use utils::{
|
||||||
auth::JwtAuth,
|
auth::JwtAuth,
|
||||||
@@ -100,45 +96,30 @@ fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError {
|
|
||||||
match e {
|
|
||||||
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
|
|
||||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
|
||||||
}
|
|
||||||
TenantMapInsertError::TenantAlreadyExists(id, state) => {
|
|
||||||
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
|
|
||||||
}
|
|
||||||
TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Helper function to construct a TimelineInfo struct for a timeline
|
// Helper function to construct a TimelineInfo struct for a timeline
|
||||||
async fn build_timeline_info(
|
async fn build_timeline_info(
|
||||||
timeline: &Arc<Timeline>,
|
timeline: &Arc<Timeline>,
|
||||||
include_non_incremental_logical_size: bool,
|
include_non_incremental_logical_size: bool,
|
||||||
ctx: &RequestContext,
|
ctx: Option<&TimelineRequestContext>,
|
||||||
) -> anyhow::Result<TimelineInfo> {
|
) -> anyhow::Result<TimelineInfo> {
|
||||||
let mut info = build_timeline_info_common(timeline, ctx)?;
|
let mut info = build_timeline_info_common(timeline, ctx)?;
|
||||||
if include_non_incremental_logical_size {
|
if include_non_incremental_logical_size {
|
||||||
// XXX we should be using spawn_ondemand_logical_size_calculation here.
|
if let Some(ctx) = ctx {
|
||||||
// Otherwise, if someone deletes the timeline / detaches the tenant while
|
info.current_logical_size_non_incremental = Some(
|
||||||
// we're executing this function, we will outlive the timeline on-disk state.
|
timeline
|
||||||
info.current_logical_size_non_incremental = Some(
|
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
|
||||||
timeline
|
.await?,
|
||||||
.get_current_logical_size_non_incremental(
|
);
|
||||||
info.last_record_lsn,
|
} else {
|
||||||
CancellationToken::new(),
|
info!("could not calculate non-incremental size for timeline because it is not active");
|
||||||
ctx,
|
}
|
||||||
)
|
|
||||||
.await?,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
Ok(info)
|
Ok(info)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_timeline_info_common(
|
fn build_timeline_info_common(
|
||||||
timeline: &Arc<Timeline>,
|
timeline: &Arc<Timeline>,
|
||||||
ctx: &RequestContext,
|
ctx: Option<&TimelineRequestContext>,
|
||||||
) -> anyhow::Result<TimelineInfo> {
|
) -> anyhow::Result<TimelineInfo> {
|
||||||
let last_record_lsn = timeline.get_last_record_lsn();
|
let last_record_lsn = timeline.get_last_record_lsn();
|
||||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||||
@@ -159,12 +140,16 @@ fn build_timeline_info_common(
|
|||||||
Lsn(0) => None,
|
Lsn(0) => None,
|
||||||
lsn @ Lsn(_) => Some(lsn),
|
lsn @ Lsn(_) => Some(lsn),
|
||||||
};
|
};
|
||||||
let current_logical_size = match timeline.get_current_logical_size(ctx) {
|
let current_logical_size = if let Some(ctx) = ctx {
|
||||||
Ok((size, _)) => Some(size),
|
match timeline.get_current_logical_size(ctx) {
|
||||||
Err(err) => {
|
Ok((size, _)) => Some(size),
|
||||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
Err(err) => {
|
||||||
None
|
error!("Timeline info creation failed to get current logical size: {err:?}");
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
};
|
};
|
||||||
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
|
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
|
||||||
let state = timeline.current_state();
|
let state = timeline.current_state();
|
||||||
@@ -210,9 +195,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
|||||||
.new_timeline_id
|
.new_timeline_id
|
||||||
.unwrap_or_else(TimelineId::generate);
|
.unwrap_or_else(TimelineId::generate);
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
let (tenant, tenant_ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
match tenant.create_timeline(
|
match tenant.create_timeline(
|
||||||
@@ -220,13 +205,13 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
|||||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||||
request_data.ancestor_start_lsn,
|
request_data.ancestor_start_lsn,
|
||||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||||
&ctx,
|
&tenant_ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||||
.await {
|
.await {
|
||||||
Ok(Some(new_timeline)) => {
|
Ok(Some((new_timeline, timeline_ctx))) => {
|
||||||
// Created. Construct a TimelineInfo for it.
|
// Created. Construct a TimelineInfo for it.
|
||||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
|
let timeline_info = build_timeline_info_common(&new_timeline, Some(&timeline_ctx))
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
json_response(StatusCode::CREATED, timeline_info)
|
json_response(StatusCode::CREATED, timeline_info)
|
||||||
}
|
}
|
||||||
@@ -241,23 +226,25 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
|||||||
query_param_present(&request, "include-non-incremental-logical-size");
|
query_param_present(&request, "include-non-incremental-logical-size");
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let top_ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let response_data = async {
|
let response_data = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
let (tenant, tenant_ctx) = mgr::get_active_tenant(tenant_id, &top_ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
let timelines = tenant.list_timelines();
|
let timelines = tenant.list_timelines();
|
||||||
|
|
||||||
let mut response_data = Vec::with_capacity(timelines.len());
|
let mut response_data = Vec::with_capacity(timelines.len());
|
||||||
for timeline in timelines {
|
for timeline in timelines {
|
||||||
let timeline_info =
|
let timeline_ctx = timeline.get_context(&tenant_ctx).ok();
|
||||||
build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx)
|
let timeline_info = build_timeline_info(
|
||||||
.await
|
&timeline,
|
||||||
.context(
|
include_non_incremental_logical_size,
|
||||||
"Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
|
timeline_ctx.as_ref(),
|
||||||
)
|
)
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.await
|
||||||
|
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
response_data.push(timeline_info);
|
response_data.push(timeline_info);
|
||||||
}
|
}
|
||||||
@@ -274,7 +261,11 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
|
|||||||
request
|
request
|
||||||
.uri()
|
.uri()
|
||||||
.query()
|
.query()
|
||||||
.map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param))
|
.map(|v| {
|
||||||
|
url::form_urlencoded::parse(v.as_bytes())
|
||||||
|
.into_owned()
|
||||||
|
.any(|(p, _)| p == param)
|
||||||
|
})
|
||||||
.unwrap_or(false)
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -283,12 +274,13 @@ fn get_query_param(request: &Request<Body>, param_name: &str) -> Result<String,
|
|||||||
Err(ApiError::BadRequest(anyhow!("empty query in request"))),
|
Err(ApiError::BadRequest(anyhow!("empty query in request"))),
|
||||||
|v| {
|
|v| {
|
||||||
url::form_urlencoded::parse(v.as_bytes())
|
url::form_urlencoded::parse(v.as_bytes())
|
||||||
|
.into_owned()
|
||||||
.find(|(k, _)| k == param_name)
|
.find(|(k, _)| k == param_name)
|
||||||
.map_or(
|
.map_or(
|
||||||
Err(ApiError::BadRequest(anyhow!(
|
Err(ApiError::BadRequest(anyhow!(
|
||||||
"no {param_name} specified in query parameters"
|
"no {param_name} specified in query parameters"
|
||||||
))),
|
))),
|
||||||
|(_, v)| Ok(v.into_owned()),
|
|(_, v)| Ok(v),
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -301,23 +293,26 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
|||||||
query_param_present(&request, "include-non-incremental-logical-size");
|
query_param_present(&request, "include-non-incremental-logical-size");
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
// Logical size calculation needs downloading.
|
let top_ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
|
||||||
|
|
||||||
let timeline_info = async {
|
let timeline_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
let (tenant, tenant_ctx) = mgr::get_active_tenant(tenant_id, &top_ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
|
|
||||||
let timeline = tenant
|
let timeline = tenant
|
||||||
.get_timeline(timeline_id, false)
|
.get_timeline(timeline_id)
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
|
let timeline_ctx = timeline.get_context(&tenant_ctx).ok();
|
||||||
|
|
||||||
let timeline_info =
|
let timeline_info = build_timeline_info(
|
||||||
build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx)
|
&timeline,
|
||||||
.await
|
include_non_incremental_logical_size,
|
||||||
.context("get local timeline info")
|
timeline_ctx.as_ref(),
|
||||||
.map_err(ApiError::InternalServerError)?;
|
)
|
||||||
|
.await
|
||||||
|
.context("Failed to get local timeline info: {e:#}")
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
Ok::<_, ApiError>(timeline_info)
|
Ok::<_, ApiError>(timeline_info)
|
||||||
}
|
}
|
||||||
@@ -339,9 +334,13 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
|||||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = mgr::get_tenant(tenant_id, true)
|
|
||||||
|
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
.await
|
.await
|
||||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
.map_err(ApiError::NotFound)?;
|
||||||
|
|
||||||
|
let (timeline, ctx) = tenant
|
||||||
|
.get_active_timeline(timeline_id, &ctx)
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
let result = timeline
|
let result = timeline
|
||||||
.find_lsn_for_timestamp(timestamp_pg, &ctx)
|
.find_lsn_for_timestamp(timestamp_pg, &ctx)
|
||||||
@@ -362,17 +361,16 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
|||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
|
||||||
|
|
||||||
info!("Handling tenant attach {tenant_id}");
|
info!("Handling tenant attach {tenant_id}");
|
||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
|
|
||||||
if let Some(remote_storage) = &state.remote_storage {
|
if let Some(remote_storage) = &state.remote_storage {
|
||||||
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
|
// FIXME: distinguish between "Tenant already exists" and other errors
|
||||||
|
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
|
||||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||||
.await
|
.await
|
||||||
.map_err(apierror_from_tenant_map_insert_error)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
} else {
|
} else {
|
||||||
return Err(ApiError::BadRequest(anyhow!(
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
"attach_tenant is not possible because pageserver was configured without remote storage"
|
"attach_tenant is not possible because pageserver was configured without remote storage"
|
||||||
@@ -387,6 +385,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
|||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
|
// deleting shouldn't require downloading anything
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
|
|
||||||
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
||||||
@@ -420,13 +419,11 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
|
|||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
|
||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
|
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
|
||||||
.instrument(info_span!("load", tenant = %tenant_id))
|
.instrument(info_span!("load", tenant = %tenant_id))
|
||||||
.await
|
.await
|
||||||
.map_err(apierror_from_tenant_map_insert_error)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
json_response(StatusCode::ACCEPTED, ())
|
json_response(StatusCode::ACCEPTED, ())
|
||||||
}
|
}
|
||||||
@@ -453,8 +450,6 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
|||||||
let response_data = mgr::list_tenants()
|
let response_data = mgr::list_tenants()
|
||||||
.instrument(info_span!("tenant_list"))
|
.instrument(info_span!("tenant_list"))
|
||||||
.await
|
.await
|
||||||
.map_err(anyhow::Error::new)
|
|
||||||
.map_err(ApiError::InternalServerError)?
|
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(id, state)| TenantInfo {
|
.map(|(id, state)| TenantInfo {
|
||||||
id: *id,
|
id: *id,
|
||||||
@@ -471,8 +466,10 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
|||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
|
let mut _req_ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let tenant_info = async {
|
let tenant_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
let tenant = mgr::get_tenant(tenant_id).await?;
|
||||||
|
|
||||||
// Calculate total physical size of all timelines
|
// Calculate total physical size of all timelines
|
||||||
let mut current_physical_size = 0;
|
let mut current_physical_size = 0;
|
||||||
@@ -495,40 +492,23 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
|||||||
json_response(StatusCode::OK, tenant_info)
|
json_response(StatusCode::OK, tenant_info)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// HTTP endpoint to query the current tenant_size of a tenant.
|
|
||||||
///
|
|
||||||
/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
|
|
||||||
/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
|
|
||||||
/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
|
|
||||||
/// values.
|
|
||||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let inputs_only = if query_param_present(&request, "inputs_only") {
|
|
||||||
get_query_param(&request, "inputs_only")?
|
|
||||||
.parse()
|
|
||||||
.map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))?
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
};
|
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
|
||||||
|
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
// this can be long operation
|
// this can be long operation, it currently is not backed by any request coalescing or similar
|
||||||
let inputs = tenant
|
let inputs = tenant
|
||||||
.gather_size_inputs(&ctx)
|
.gather_size_inputs(&ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
let size = if !inputs_only {
|
let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
|
||||||
Some(inputs.calculate().map_err(ApiError::InternalServerError)?)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Private response type with the additional "unstable" `inputs` field.
|
/// Private response type with the additional "unstable" `inputs` field.
|
||||||
///
|
///
|
||||||
@@ -540,9 +520,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
|||||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
id: TenantId,
|
id: TenantId,
|
||||||
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
||||||
///
|
size: u64,
|
||||||
/// Will be none if `?inputs_only=true` was given.
|
|
||||||
size: Option<u64>,
|
|
||||||
inputs: crate::tenant::size::ModelInputs,
|
inputs: crate::tenant::size::ModelInputs,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -569,7 +547,7 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
|
|||||||
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
check_permission(&request, None)?;
|
check_permission(&request, None)?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||||
|
|
||||||
@@ -648,28 +626,34 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
|||||||
tenant_conf,
|
tenant_conf,
|
||||||
target_tenant_id,
|
target_tenant_id,
|
||||||
state.remote_storage.clone(),
|
state.remote_storage.clone(),
|
||||||
&ctx,
|
|
||||||
)
|
)
|
||||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
||||||
.await
|
.await
|
||||||
.map_err(apierror_from_tenant_map_insert_error)?;
|
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||||
|
// with better error handling once the type permits it
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
// We created the tenant. Existing API semantics are that the tenant
|
Ok(match new_tenant {
|
||||||
// is Active when this function returns.
|
Some(tenant) => {
|
||||||
if let res @ Err(_) = new_tenant.wait_to_become_active().await {
|
// We created the tenant. Existing API semantics are that the tenant
|
||||||
// This shouldn't happen because we just created the tenant directory
|
// is Active when this function returns.
|
||||||
// in tenant::mgr::create_tenant, and there aren't any remote timelines
|
if let res @ Err(_) = tenant.wait_to_become_active(ctx).await {
|
||||||
// to load, so, nothing can really fail during load.
|
// This shouldn't happen because we just created the tenant directory
|
||||||
// Don't do cleanup because we don't know how we got here.
|
// in tenant::mgr::create_tenant, and there aren't any remote timelines
|
||||||
// The tenant will likely be in `Broken` state and subsequent
|
// to load, so, nothing can really fail during load.
|
||||||
// calls will fail.
|
// Don't do cleanup because we don't know how we got here.
|
||||||
res.context("created tenant failed to become active")
|
// The tenant will likely be in `Broken` state and subsequent
|
||||||
.map_err(ApiError::InternalServerError)?;
|
// calls will fail.
|
||||||
}
|
res.context("created tenant failed to become active")
|
||||||
json_response(
|
.map_err(ApiError::InternalServerError)?;
|
||||||
StatusCode::CREATED,
|
}
|
||||||
TenantCreateResponse(new_tenant.tenant_id()),
|
json_response(
|
||||||
)
|
StatusCode::CREATED,
|
||||||
|
TenantCreateResponse(tenant.tenant_id()),
|
||||||
|
)?
|
||||||
|
}
|
||||||
|
None => json_response(StatusCode::CONFLICT, ())?,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
@@ -677,6 +661,8 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
|||||||
let tenant_id = request_data.tenant_id;
|
let tenant_id = request_data.tenant_id;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let mut tenant_conf: TenantConfOpt = Default::default();
|
let mut tenant_conf: TenantConfOpt = Default::default();
|
||||||
if let Some(gc_period) = request_data.gc_period {
|
if let Some(gc_period) = request_data.gc_period {
|
||||||
tenant_conf.gc_period = Some(
|
tenant_conf.gc_period = Some(
|
||||||
@@ -739,7 +725,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
|||||||
}
|
}
|
||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id, &ctx)
|
||||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||||
.await
|
.await
|
||||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||||
@@ -792,11 +778,20 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
|||||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, &ctx).await?;
|
|
||||||
let gc_result = wait_task_done
|
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::NotFound)?;
|
||||||
|
|
||||||
|
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||||
|
// Use tenant's pitr setting
|
||||||
|
let pitr = tenant.get_pitr_interval();
|
||||||
|
|
||||||
|
fail::fail_point!("immediate_gc_task_pre");
|
||||||
|
let gc_result = tenant
|
||||||
|
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
|
||||||
|
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||||
.await
|
.await
|
||||||
.context("wait for gc task")
|
|
||||||
.map_err(ApiError::InternalServerError)?
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, gc_result)
|
json_response(StatusCode::OK, gc_result)
|
||||||
@@ -810,16 +805,18 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
|||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
|
|
||||||
.await
|
|
||||||
.context("spawn compaction task")
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
let result: anyhow::Result<()> = result_receiver
|
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::NotFound)?;
|
||||||
|
let (timeline, ctx) = tenant
|
||||||
|
.get_active_timeline(timeline_id, &ctx)
|
||||||
|
.map_err(ApiError::NotFound)?;
|
||||||
|
timeline
|
||||||
|
.compact(&ctx)
|
||||||
|
.instrument(info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id))
|
||||||
.await
|
.await
|
||||||
.context("receive compaction result")
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
result.map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
@@ -832,11 +829,12 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
|||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
|
||||||
|
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
let timeline = tenant
|
let (timeline, ctx) = tenant
|
||||||
.get_timeline(timeline_id, true)
|
.get_active_timeline(timeline_id, &ctx)
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
timeline
|
timeline
|
||||||
.freeze_and_flush()
|
.freeze_and_flush()
|
||||||
@@ -844,6 +842,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
|||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
timeline
|
timeline
|
||||||
.compact(&ctx)
|
.compact(&ctx)
|
||||||
|
.instrument(info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id))
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
@@ -851,20 +850,21 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn timeline_download_remote_layers_handler_post(
|
async fn timeline_download_remote_layers_handler_post(
|
||||||
mut request: Request<Body>,
|
request: Request<Body>,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
|
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
|
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
let timeline = tenant
|
let (timeline, ctx) = tenant
|
||||||
.get_timeline(timeline_id, true)
|
.get_active_timeline(timeline_id, &ctx)
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
match timeline.spawn_download_all_remote_layers(body).await {
|
match timeline.spawn_download_all_remote_layers(&ctx).await {
|
||||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||||
}
|
}
|
||||||
@@ -877,11 +877,13 @@ async fn timeline_download_remote_layers_handler_get(
|
|||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
|
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
let timeline = tenant
|
let (timeline, _ctx) = tenant
|
||||||
.get_timeline(timeline_id, true)
|
.get_active_timeline(timeline_id, &ctx)
|
||||||
.map_err(ApiError::NotFound)?;
|
.map_err(ApiError::NotFound)?;
|
||||||
let info = timeline
|
let info = timeline
|
||||||
.get_download_all_remote_layers_task_info()
|
.get_download_all_remote_layers_task_info()
|
||||||
|
|||||||
@@ -12,9 +12,8 @@ use tokio_tar::Archive;
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
|
||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::{Timeline, TimelineRequestContext};
|
||||||
use crate::walingest::WalIngest;
|
use crate::walingest::WalIngest;
|
||||||
use crate::walrecord::DecodedWALRecord;
|
use crate::walrecord::DecodedWALRecord;
|
||||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||||
@@ -48,7 +47,7 @@ pub async fn import_timeline_from_postgres_datadir(
|
|||||||
tline: &Timeline,
|
tline: &Timeline,
|
||||||
pgdata_path: &Path,
|
pgdata_path: &Path,
|
||||||
pgdata_lsn: Lsn,
|
pgdata_lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut pg_control: Option<ControlFileData> = None;
|
let mut pg_control: Option<ControlFileData> = None;
|
||||||
|
|
||||||
@@ -116,7 +115,7 @@ async fn import_rel(
|
|||||||
dboid: Oid,
|
dboid: Oid,
|
||||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||||
len: usize,
|
len: usize,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Does it look like a relation file?
|
// Does it look like a relation file?
|
||||||
trace!("importing rel file {}", path.display());
|
trace!("importing rel file {}", path.display());
|
||||||
@@ -202,7 +201,7 @@ async fn import_slru(
|
|||||||
path: &Path,
|
path: &Path,
|
||||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||||
len: usize,
|
len: usize,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
info!("importing slru file {path:?}");
|
info!("importing slru file {path:?}");
|
||||||
|
|
||||||
@@ -260,7 +259,7 @@ async fn import_wal(
|
|||||||
tline: &Timeline,
|
tline: &Timeline,
|
||||||
startpoint: Lsn,
|
startpoint: Lsn,
|
||||||
endpoint: Lsn,
|
endpoint: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
|
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
|
||||||
|
|
||||||
@@ -335,7 +334,7 @@ pub async fn import_basebackup_from_tar(
|
|||||||
tline: &Timeline,
|
tline: &Timeline,
|
||||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||||
base_lsn: Lsn,
|
base_lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
info!("importing base at {base_lsn}");
|
info!("importing base at {base_lsn}");
|
||||||
let mut modification = tline.begin_modification(base_lsn);
|
let mut modification = tline.begin_modification(base_lsn);
|
||||||
@@ -386,7 +385,7 @@ pub async fn import_wal_from_tar(
|
|||||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||||
start_lsn: Lsn,
|
start_lsn: Lsn,
|
||||||
end_lsn: Lsn,
|
end_lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// Set up walingest mutable state
|
// Set up walingest mutable state
|
||||||
let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version);
|
let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version);
|
||||||
@@ -477,7 +476,7 @@ async fn import_file(
|
|||||||
file_path: &Path,
|
file_path: &Path,
|
||||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||||
len: usize,
|
len: usize,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<Option<ControlFileData>> {
|
) -> Result<Option<ControlFileData>> {
|
||||||
let file_name = match file_path.file_name() {
|
let file_name = match file_path.file_name() {
|
||||||
Some(name) => name.to_string_lossy(),
|
Some(name) => name.to_string_lossy(),
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ pub mod walredo;
|
|||||||
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use crate::task_mgr::TaskKind;
|
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
/// Current storage format version
|
/// Current storage format version
|
||||||
@@ -42,35 +41,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
|||||||
|
|
||||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||||
|
|
||||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
|
||||||
// Shut down the libpq endpoint task. This prevents new connections from
|
|
||||||
// being accepted.
|
|
||||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
|
|
||||||
|
|
||||||
// Shut down any page service tasks.
|
|
||||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
|
|
||||||
|
|
||||||
// Shut down all the tenants. This flushes everything to disk and kills
|
|
||||||
// the checkpoint and GC tasks.
|
|
||||||
tenant::mgr::shutdown_all_tenants().await;
|
|
||||||
|
|
||||||
// Stop syncing with remote storage.
|
|
||||||
//
|
|
||||||
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
|
|
||||||
// Should it?
|
|
||||||
task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
|
|
||||||
|
|
||||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
|
||||||
// status while it's shutting down.
|
|
||||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
|
||||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
|
|
||||||
|
|
||||||
// There should be nothing left, but let's be sure
|
|
||||||
task_mgr::shutdown_tasks(None, None, None).await;
|
|
||||||
info!("Shut down successfully completed");
|
|
||||||
std::process::exit(exit_code);
|
|
||||||
}
|
|
||||||
|
|
||||||
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||||
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ use anyhow::Context;
|
|||||||
use bytes::Buf;
|
use bytes::Buf;
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures::{Stream, StreamExt};
|
use futures::{Stream, StreamExt};
|
||||||
use pageserver_api::models::TenantState;
|
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||||
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
||||||
@@ -31,37 +30,40 @@ use std::sync::Arc;
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::ConnectionId;
|
use utils::id::ConnectionId;
|
||||||
|
use utils::postgres_backend_async::QueryError;
|
||||||
use utils::{
|
use utils::{
|
||||||
auth::{Claims, JwtAuth, Scope},
|
auth::{Claims, JwtAuth, Scope},
|
||||||
id::{TenantId, TimelineId},
|
id::{TenantId, TimelineId},
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
postgres_backend::AuthType,
|
postgres_backend::AuthType,
|
||||||
postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError},
|
postgres_backend_async::{self, PostgresBackend},
|
||||||
simple_rcu::RcuReadGuard,
|
simple_rcu::RcuReadGuard,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::auth::check_permission;
|
use crate::auth::check_permission;
|
||||||
use crate::basebackup;
|
use crate::basebackup;
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||||
use crate::import_datadir::import_wal_from_tar;
|
use crate::import_datadir::import_wal_from_tar;
|
||||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||||
use crate::task_mgr;
|
use crate::task_mgr;
|
||||||
use crate::task_mgr::TaskKind;
|
|
||||||
use crate::tenant::mgr;
|
use crate::tenant::mgr;
|
||||||
use crate::tenant::{Tenant, Timeline};
|
use crate::tenant::{Tenant, TenantRequestContext, Timeline, TimelineRequestContext};
|
||||||
use crate::trace::Tracer;
|
use crate::trace::Tracer;
|
||||||
|
|
||||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||||
use postgres_ffi::BLCKSZ;
|
use postgres_ffi::BLCKSZ;
|
||||||
|
|
||||||
fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
|
fn copyin_stream<'a>(
|
||||||
|
pgb: &'a mut PostgresBackend,
|
||||||
|
ctx: &'a RequestContext,
|
||||||
|
) -> impl Stream<Item = io::Result<Bytes>> + 'a {
|
||||||
async_stream::try_stream! {
|
async_stream::try_stream! {
|
||||||
loop {
|
loop {
|
||||||
let msg = tokio::select! {
|
let msg = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = ctx.cancelled() => {
|
||||||
// We were requested to shut down.
|
// We were requested to shut down.
|
||||||
let msg = format!("pageserver is shutting down");
|
let msg = format!("pageserver is shutting down");
|
||||||
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
|
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
|
||||||
@@ -133,8 +135,9 @@ pub async fn libpq_listener_main(
|
|||||||
while let Some(res) = tokio::select! {
|
while let Some(res) = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = listener_ctx.cancelled() => {
|
||||||
// We were requested to shut down.
|
// We were requested to shut down.
|
||||||
|
info!("libpq listener shutting down");
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -148,8 +151,11 @@ pub async fn libpq_listener_main(
|
|||||||
debug!("accepted connection from {}", peer_addr);
|
debug!("accepted connection from {}", peer_addr);
|
||||||
let local_auth = auth.clone();
|
let local_auth = auth.clone();
|
||||||
|
|
||||||
let connection_ctx = listener_ctx
|
let connection_ctx = RequestContext::with_parent(
|
||||||
.detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
|
TaskKind::PageRequestHandler,
|
||||||
|
DownloadBehavior::Download,
|
||||||
|
&listener_ctx,
|
||||||
|
);
|
||||||
|
|
||||||
// PageRequestHandler tasks are not associated with any particular
|
// PageRequestHandler tasks are not associated with any particular
|
||||||
// timeline in the task manager. In practice most connections will
|
// timeline in the task manager. In practice most connections will
|
||||||
@@ -157,12 +163,21 @@ pub async fn libpq_listener_main(
|
|||||||
// yet.
|
// yet.
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
&tokio::runtime::Handle::current(),
|
&tokio::runtime::Handle::current(),
|
||||||
TaskKind::PageRequestHandler,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
"serving compute connection task",
|
"serving compute connection task",
|
||||||
false,
|
false,
|
||||||
page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx),
|
async move {
|
||||||
|
if let Err(err) = page_service_conn_main(
|
||||||
|
conf,
|
||||||
|
local_auth,
|
||||||
|
socket,
|
||||||
|
auth_type,
|
||||||
|
connection_ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
error!("connection handler exited with error: {err:?}");
|
||||||
|
}
|
||||||
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
@@ -197,24 +212,26 @@ async fn page_service_conn_main(
|
|||||||
.set_nodelay(true)
|
.set_nodelay(true)
|
||||||
.context("could not set TCP_NODELAY")?;
|
.context("could not set TCP_NODELAY")?;
|
||||||
|
|
||||||
// XXX: pgbackend.run() should take the connection_ctx,
|
let cancellation_token = connection_ctx.cancellation_token().clone();
|
||||||
// and create a child per-query context when it invokes process_query.
|
|
||||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
|
||||||
// and create the per-query context in process_query ourselves.
|
|
||||||
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
|
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
|
||||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||||
|
|
||||||
match pgbackend
|
let result = pgbackend
|
||||||
.run(&mut conn_handler, task_mgr::shutdown_watcher)
|
.run(&mut conn_handler, || cancellation_token.cancelled())
|
||||||
.await
|
.await;
|
||||||
{
|
match result {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
// we've been requested to shut down
|
// we've been requested to shut down
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
|
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
|
||||||
if is_expected_io_error(&io_error) {
|
// `ConnectionReset` error happens when the Postgres client closes the connection.
|
||||||
info!("Postgres client disconnected ({io_error})");
|
// As this disconnection happens quite often and is expected,
|
||||||
|
// we decided to downgrade the logging level to `INFO`.
|
||||||
|
// See: https://github.com/neondatabase/neon/issues/1683.
|
||||||
|
if io_error.kind() == io::ErrorKind::ConnectionReset {
|
||||||
|
info!("Postgres client disconnected");
|
||||||
Ok(())
|
Ok(())
|
||||||
} else {
|
} else {
|
||||||
Err(io_error).context("Postgres connection error")
|
Err(io_error).context("Postgres connection error")
|
||||||
@@ -262,10 +279,6 @@ struct PageServerHandler {
|
|||||||
auth: Option<Arc<JwtAuth>>,
|
auth: Option<Arc<JwtAuth>>,
|
||||||
claims: Option<Claims>,
|
claims: Option<Claims>,
|
||||||
|
|
||||||
/// The context created for the lifetime of the connection
|
|
||||||
/// services by this PageServerHandler.
|
|
||||||
/// For each query received over the connection,
|
|
||||||
/// `process_query` creates a child context from this one.
|
|
||||||
connection_ctx: RequestContext,
|
connection_ctx: RequestContext,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -283,20 +296,16 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
#[instrument(skip(self, pgb))]
|
||||||
async fn handle_pagerequests(
|
async fn handle_pagerequests(
|
||||||
&self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend,
|
pgb: &mut PostgresBackend,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
ctx: RequestContext,
|
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// NOTE: pagerequests handler exits when connection is closed,
|
let (tenant, ctx) = get_active_tenant_with_timeout(tenant_id, &self.connection_ctx).await?;
|
||||||
// so there is no need to reset the association
|
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
|
|
||||||
// Make request tracer if needed
|
// Make request tracer if needed
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
|
||||||
let mut tracer = if tenant.get_trace_read_requests() {
|
let mut tracer = if tenant.get_trace_read_requests() {
|
||||||
let connection_id = ConnectionId::generate();
|
let connection_id = ConnectionId::generate();
|
||||||
let path = tenant
|
let path = tenant
|
||||||
@@ -308,7 +317,7 @@ impl PageServerHandler {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Check that the timeline exists
|
// Check that the timeline exists
|
||||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
let (timeline, ctx) = tenant.get_active_timeline(timeline_id, &ctx)?;
|
||||||
|
|
||||||
// switch client to COPYBOTH
|
// switch client to COPYBOTH
|
||||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||||
@@ -320,7 +329,7 @@ impl PageServerHandler {
|
|||||||
let msg = tokio::select! {
|
let msg = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = ctx.cancelled() => {
|
||||||
// We were requested to shut down.
|
// We were requested to shut down.
|
||||||
info!("shutdown request received in page handler");
|
info!("shutdown request received in page handler");
|
||||||
break;
|
break;
|
||||||
@@ -386,23 +395,22 @@ impl PageServerHandler {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[instrument(skip(self, pgb))]
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
|
||||||
async fn handle_import_basebackup(
|
async fn handle_import_basebackup(
|
||||||
&self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend,
|
pgb: &mut PostgresBackend,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
base_lsn: Lsn,
|
base_lsn: Lsn,
|
||||||
_end_lsn: Lsn,
|
_end_lsn: Lsn,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
ctx: RequestContext,
|
|
||||||
) -> Result<(), QueryError> {
|
) -> Result<(), QueryError> {
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
// Create empty timeline
|
// Create empty timeline
|
||||||
info!("creating new timeline");
|
info!("creating new timeline");
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let (tenant, tenant_ctx) =
|
||||||
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;
|
get_active_tenant_with_timeout(tenant_id, &self.connection_ctx).await?;
|
||||||
|
let (timeline, ctx) =
|
||||||
|
tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &tenant_ctx)?;
|
||||||
|
|
||||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||||
// We might have some wal to import as well, and we should prevent compute
|
// We might have some wal to import as well, and we should prevent compute
|
||||||
@@ -419,7 +427,7 @@ impl PageServerHandler {
|
|||||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||||
pgb.flush().await?;
|
pgb.flush().await?;
|
||||||
|
|
||||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
let mut copyin_stream = Box::pin(copyin_stream(pgb, &ctx));
|
||||||
timeline
|
timeline
|
||||||
.import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx)
|
.import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -443,19 +451,17 @@ impl PageServerHandler {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
#[instrument(skip(self, pgb))]
|
||||||
async fn handle_import_wal(
|
async fn handle_import_wal(
|
||||||
&self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend,
|
pgb: &mut PostgresBackend,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
start_lsn: Lsn,
|
start_lsn: Lsn,
|
||||||
end_lsn: Lsn,
|
end_lsn: Lsn,
|
||||||
ctx: RequestContext,
|
|
||||||
) -> Result<(), QueryError> {
|
) -> Result<(), QueryError> {
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
let (timeline, ctx) =
|
||||||
|
get_active_timeline_with_timeout(tenant_id, timeline_id, &self.connection_ctx).await?;
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
|
||||||
let last_record_lsn = timeline.get_last_record_lsn();
|
let last_record_lsn = timeline.get_last_record_lsn();
|
||||||
if last_record_lsn != start_lsn {
|
if last_record_lsn != start_lsn {
|
||||||
return Err(QueryError::Other(
|
return Err(QueryError::Other(
|
||||||
@@ -470,7 +476,7 @@ impl PageServerHandler {
|
|||||||
info!("importing wal");
|
info!("importing wal");
|
||||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||||
pgb.flush().await?;
|
pgb.flush().await?;
|
||||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
let mut copyin_stream = Box::pin(copyin_stream(pgb, &ctx));
|
||||||
let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
|
let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
|
||||||
import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?;
|
import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?;
|
||||||
info!("wal import complete");
|
info!("wal import complete");
|
||||||
@@ -518,7 +524,7 @@ impl PageServerHandler {
|
|||||||
mut lsn: Lsn,
|
mut lsn: Lsn,
|
||||||
latest: bool,
|
latest: bool,
|
||||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<Lsn> {
|
) -> anyhow::Result<Lsn> {
|
||||||
if latest {
|
if latest {
|
||||||
// Latest page version was requested. If LSN is given, it is a hint
|
// Latest page version was requested. If LSN is given, it is a hint
|
||||||
@@ -567,7 +573,7 @@ impl PageServerHandler {
|
|||||||
&self,
|
&self,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
req: &PagestreamExistsRequest,
|
req: &PagestreamExistsRequest,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<PagestreamBeMessage> {
|
) -> anyhow::Result<PagestreamBeMessage> {
|
||||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||||
let lsn =
|
let lsn =
|
||||||
@@ -588,7 +594,7 @@ impl PageServerHandler {
|
|||||||
&self,
|
&self,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
req: &PagestreamNblocksRequest,
|
req: &PagestreamNblocksRequest,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<PagestreamBeMessage> {
|
) -> anyhow::Result<PagestreamBeMessage> {
|
||||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||||
let lsn =
|
let lsn =
|
||||||
@@ -607,7 +613,7 @@ impl PageServerHandler {
|
|||||||
&self,
|
&self,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
req: &PagestreamDbSizeRequest,
|
req: &PagestreamDbSizeRequest,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<PagestreamBeMessage> {
|
) -> anyhow::Result<PagestreamBeMessage> {
|
||||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||||
let lsn =
|
let lsn =
|
||||||
@@ -629,7 +635,7 @@ impl PageServerHandler {
|
|||||||
&self,
|
&self,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
req: &PagestreamGetPageRequest,
|
req: &PagestreamGetPageRequest,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<PagestreamBeMessage> {
|
) -> anyhow::Result<PagestreamBeMessage> {
|
||||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||||
let lsn =
|
let lsn =
|
||||||
@@ -653,8 +659,7 @@ impl PageServerHandler {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[instrument(skip(self, pgb))]
|
||||||
#[instrument(skip(self, pgb, ctx))]
|
|
||||||
async fn handle_basebackup_request(
|
async fn handle_basebackup_request(
|
||||||
&mut self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend,
|
pgb: &mut PostgresBackend,
|
||||||
@@ -663,10 +668,11 @@ impl PageServerHandler {
|
|||||||
lsn: Option<Lsn>,
|
lsn: Option<Lsn>,
|
||||||
prev_lsn: Option<Lsn>,
|
prev_lsn: Option<Lsn>,
|
||||||
full_backup: bool,
|
full_backup: bool,
|
||||||
ctx: RequestContext,
|
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// check that the timeline exists
|
// check that the timeline exists
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let (timeline, ctx) =
|
||||||
|
get_active_timeline_with_timeout(tenant_id, timeline_id, &self.connection_ctx).await?;
|
||||||
|
|
||||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||||
if let Some(lsn) = lsn {
|
if let Some(lsn) = lsn {
|
||||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||||
@@ -763,7 +769,6 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
pgb: &mut PostgresBackend,
|
pgb: &mut PostgresBackend,
|
||||||
query_string: &str,
|
query_string: &str,
|
||||||
) -> Result<(), QueryError> {
|
) -> Result<(), QueryError> {
|
||||||
let ctx = self.connection_ctx.attached_child();
|
|
||||||
debug!("process query {query_string:?}");
|
debug!("process query {query_string:?}");
|
||||||
|
|
||||||
if query_string.starts_with("pagestream ") {
|
if query_string.starts_with("pagestream ") {
|
||||||
@@ -781,7 +786,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
|
self.handle_pagerequests(pgb, tenant_id, timeline_id)
|
||||||
.await?;
|
.await?;
|
||||||
} else if query_string.starts_with("basebackup ") {
|
} else if query_string.starts_with("basebackup ") {
|
||||||
let (_, params_raw) = query_string.split_at("basebackup ".len());
|
let (_, params_raw) = query_string.split_at("basebackup ".len());
|
||||||
@@ -810,7 +815,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Check that the timeline exists
|
// Check that the timeline exists
|
||||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
|
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false)
|
||||||
.await?;
|
.await?;
|
||||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
}
|
}
|
||||||
@@ -831,7 +836,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let (timeline, _ctx) =
|
||||||
|
get_active_timeline_with_timeout(tenant_id, timeline_id, &self.connection_ctx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||||
|
|
||||||
@@ -882,7 +889,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
// Check that the timeline exists
|
// Check that the timeline exists
|
||||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
|
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true)
|
||||||
.await?;
|
.await?;
|
||||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
} else if query_string.starts_with("import basebackup ") {
|
} else if query_string.starts_with("import basebackup ") {
|
||||||
@@ -925,7 +932,6 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
base_lsn,
|
base_lsn,
|
||||||
end_lsn,
|
end_lsn,
|
||||||
pg_version,
|
pg_version,
|
||||||
ctx,
|
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
@@ -962,7 +968,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
match self
|
match self
|
||||||
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
|
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||||
@@ -992,7 +998,8 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let (tenant, _ctx) =
|
||||||
|
get_active_tenant_with_timeout(tenant_id, &self.connection_ctx).await?;
|
||||||
pgb.write_message(&BeMessage::RowDescription(&[
|
pgb.write_message(&BeMessage::RowDescription(&[
|
||||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||||
@@ -1038,30 +1045,6 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
|
||||||
enum GetActiveTenantError {
|
|
||||||
#[error(
|
|
||||||
"Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
|
|
||||||
)]
|
|
||||||
WaitForActiveTimeout {
|
|
||||||
latest_state: TenantState,
|
|
||||||
wait_time: Duration,
|
|
||||||
},
|
|
||||||
#[error(transparent)]
|
|
||||||
Other(#[from] anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<GetActiveTenantError> for QueryError {
|
|
||||||
fn from(e: GetActiveTenantError) -> Self {
|
|
||||||
match e {
|
|
||||||
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
|
||||||
ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
|
||||||
),
|
|
||||||
GetActiveTenantError::Other(e) => QueryError::Other(e),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get active tenant.
|
/// Get active tenant.
|
||||||
///
|
///
|
||||||
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
|
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
|
||||||
@@ -1069,35 +1052,34 @@ impl From<GetActiveTenantError> for QueryError {
|
|||||||
/// all tenants are still loading.
|
/// all tenants are still loading.
|
||||||
async fn get_active_tenant_with_timeout(
|
async fn get_active_tenant_with_timeout(
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
|
parent_ctx: &RequestContext,
|
||||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
) -> anyhow::Result<(Arc<Tenant>, TenantRequestContext)> {
|
||||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
let child_ctx = RequestContext::with_parent(
|
||||||
let wait_time = Duration::from_secs(30);
|
parent_ctx.task_kind(),
|
||||||
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
|
parent_ctx.download_behavior(),
|
||||||
Ok(Ok(())) => Ok(tenant),
|
parent_ctx,
|
||||||
// no .context(), the error message is good enough and some tests depend on it
|
);
|
||||||
Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
|
|
||||||
Err(_) => {
|
let tenant = mgr::get_tenant(tenant_id).await?;
|
||||||
let latest_state = tenant.current_state();
|
match tokio::time::timeout(
|
||||||
if latest_state == TenantState::Active {
|
Duration::from_secs(30),
|
||||||
Ok(tenant)
|
tenant.wait_to_become_active(child_ctx),
|
||||||
} else {
|
)
|
||||||
Err(GetActiveTenantError::WaitForActiveTimeout {
|
.await
|
||||||
latest_state,
|
{
|
||||||
wait_time,
|
Ok(Ok(ctx)) => Ok((tenant, ctx)),
|
||||||
})
|
Ok(Err(err)) => Err(err),
|
||||||
}
|
Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"),
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||||
async fn get_active_tenant_timeline(
|
async fn get_active_timeline_with_timeout(
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Arc<Timeline>, GetActiveTenantError> {
|
) -> anyhow::Result<(Arc<Timeline>, TimelineRequestContext)> {
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
|
get_active_tenant_with_timeout(tenant_id, ctx)
|
||||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
.await
|
||||||
Ok(timeline)
|
.and_then(|(tenant, ctx)| tenant.get_active_timeline(timeline_id, &ctx))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,10 +6,9 @@
|
|||||||
//! walingest.rs handles a few things like implicit relation creation and extension.
|
//! walingest.rs handles a few things like implicit relation creation and extension.
|
||||||
//! Clarify that)
|
//! Clarify that)
|
||||||
//!
|
//!
|
||||||
use super::tenant::{PageReconstructError, Timeline};
|
|
||||||
use crate::context::RequestContext;
|
|
||||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||||
use crate::repository::*;
|
use crate::repository::*;
|
||||||
|
use crate::tenant::{PageReconstructError, Timeline, TimelineRequestContext};
|
||||||
use crate::walrecord::NeonWalRecord;
|
use crate::walrecord::NeonWalRecord;
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use bytes::{Buf, Bytes};
|
use bytes::{Buf, Bytes};
|
||||||
@@ -20,7 +19,6 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::collections::{hash_map, HashMap, HashSet};
|
use std::collections::{hash_map, HashMap, HashSet};
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use tracing::{debug, trace, warn};
|
use tracing::{debug, trace, warn};
|
||||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||||
|
|
||||||
@@ -35,14 +33,6 @@ pub enum LsnForTimestamp {
|
|||||||
NoData(Lsn),
|
NoData(Lsn),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
|
||||||
pub enum CalculateLogicalSizeError {
|
|
||||||
#[error("cancelled")]
|
|
||||||
Cancelled,
|
|
||||||
#[error(transparent)]
|
|
||||||
Other(#[from] anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
|
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
|
||||||
/// and other special kinds of files, in a versioned key-value store. The
|
/// and other special kinds of files, in a versioned key-value store. The
|
||||||
@@ -98,7 +88,7 @@ impl Timeline {
|
|||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
latest: bool,
|
latest: bool,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<Bytes, PageReconstructError> {
|
) -> Result<Bytes, PageReconstructError> {
|
||||||
if tag.relnode == 0 {
|
if tag.relnode == 0 {
|
||||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||||
@@ -126,7 +116,7 @@ impl Timeline {
|
|||||||
dbnode: Oid,
|
dbnode: Oid,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
latest: bool,
|
latest: bool,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<usize, PageReconstructError> {
|
) -> Result<usize, PageReconstructError> {
|
||||||
let mut total_blocks = 0;
|
let mut total_blocks = 0;
|
||||||
|
|
||||||
@@ -145,7 +135,7 @@ impl Timeline {
|
|||||||
tag: RelTag,
|
tag: RelTag,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
latest: bool,
|
latest: bool,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<BlockNumber, PageReconstructError> {
|
) -> Result<BlockNumber, PageReconstructError> {
|
||||||
if tag.relnode == 0 {
|
if tag.relnode == 0 {
|
||||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||||
@@ -190,7 +180,7 @@ impl Timeline {
|
|||||||
tag: RelTag,
|
tag: RelTag,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
_latest: bool,
|
_latest: bool,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<bool, PageReconstructError> {
|
) -> Result<bool, PageReconstructError> {
|
||||||
if tag.relnode == 0 {
|
if tag.relnode == 0 {
|
||||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||||
@@ -221,7 +211,7 @@ impl Timeline {
|
|||||||
spcnode: Oid,
|
spcnode: Oid,
|
||||||
dbnode: Oid,
|
dbnode: Oid,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||||
// fetch directory listing
|
// fetch directory listing
|
||||||
let key = rel_dir_to_key(spcnode, dbnode);
|
let key = rel_dir_to_key(spcnode, dbnode);
|
||||||
@@ -250,7 +240,7 @@ impl Timeline {
|
|||||||
segno: u32,
|
segno: u32,
|
||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<Bytes, PageReconstructError> {
|
) -> Result<Bytes, PageReconstructError> {
|
||||||
let key = slru_block_to_key(kind, segno, blknum);
|
let key = slru_block_to_key(kind, segno, blknum);
|
||||||
self.get(key, lsn, ctx).await
|
self.get(key, lsn, ctx).await
|
||||||
@@ -262,7 +252,7 @@ impl Timeline {
|
|||||||
kind: SlruKind,
|
kind: SlruKind,
|
||||||
segno: u32,
|
segno: u32,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<BlockNumber, PageReconstructError> {
|
) -> Result<BlockNumber, PageReconstructError> {
|
||||||
let key = slru_segment_size_to_key(kind, segno);
|
let key = slru_segment_size_to_key(kind, segno);
|
||||||
let mut buf = self.get(key, lsn, ctx).await?;
|
let mut buf = self.get(key, lsn, ctx).await?;
|
||||||
@@ -275,7 +265,7 @@ impl Timeline {
|
|||||||
kind: SlruKind,
|
kind: SlruKind,
|
||||||
segno: u32,
|
segno: u32,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<bool, PageReconstructError> {
|
) -> Result<bool, PageReconstructError> {
|
||||||
// fetch directory listing
|
// fetch directory listing
|
||||||
let key = slru_dir_to_key(kind);
|
let key = slru_dir_to_key(kind);
|
||||||
@@ -300,7 +290,7 @@ impl Timeline {
|
|||||||
pub async fn find_lsn_for_timestamp(
|
pub async fn find_lsn_for_timestamp(
|
||||||
&self,
|
&self,
|
||||||
search_timestamp: TimestampTz,
|
search_timestamp: TimestampTz,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||||
let min_lsn = *gc_cutoff_lsn_guard;
|
let min_lsn = *gc_cutoff_lsn_guard;
|
||||||
@@ -373,7 +363,7 @@ impl Timeline {
|
|||||||
probe_lsn: Lsn,
|
probe_lsn: Lsn,
|
||||||
found_smaller: &mut bool,
|
found_smaller: &mut bool,
|
||||||
found_larger: &mut bool,
|
found_larger: &mut bool,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<bool, PageReconstructError> {
|
) -> Result<bool, PageReconstructError> {
|
||||||
for segno in self
|
for segno in self
|
||||||
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
||||||
@@ -409,7 +399,7 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
kind: SlruKind,
|
kind: SlruKind,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<HashSet<u32>, PageReconstructError> {
|
) -> Result<HashSet<u32>, PageReconstructError> {
|
||||||
// fetch directory entry
|
// fetch directory entry
|
||||||
let key = slru_dir_to_key(kind);
|
let key = slru_dir_to_key(kind);
|
||||||
@@ -426,7 +416,7 @@ impl Timeline {
|
|||||||
spcnode: Oid,
|
spcnode: Oid,
|
||||||
dbnode: Oid,
|
dbnode: Oid,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<Bytes, PageReconstructError> {
|
) -> Result<Bytes, PageReconstructError> {
|
||||||
let key = relmap_file_key(spcnode, dbnode);
|
let key = relmap_file_key(spcnode, dbnode);
|
||||||
|
|
||||||
@@ -437,7 +427,7 @@ impl Timeline {
|
|||||||
pub async fn list_dbdirs(
|
pub async fn list_dbdirs(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<HashMap<(Oid, Oid), bool>, PageReconstructError> {
|
) -> Result<HashMap<(Oid, Oid), bool>, PageReconstructError> {
|
||||||
// fetch directory entry
|
// fetch directory entry
|
||||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||||
@@ -452,7 +442,7 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
xid: TransactionId,
|
xid: TransactionId,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<Bytes, PageReconstructError> {
|
) -> Result<Bytes, PageReconstructError> {
|
||||||
let key = twophase_file_key(xid);
|
let key = twophase_file_key(xid);
|
||||||
let buf = self.get(key, lsn, ctx).await?;
|
let buf = self.get(key, lsn, ctx).await?;
|
||||||
@@ -462,7 +452,7 @@ impl Timeline {
|
|||||||
pub async fn list_twophase_files(
|
pub async fn list_twophase_files(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<HashSet<TransactionId>, PageReconstructError> {
|
) -> Result<HashSet<TransactionId>, PageReconstructError> {
|
||||||
// fetch directory entry
|
// fetch directory entry
|
||||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||||
@@ -476,7 +466,7 @@ impl Timeline {
|
|||||||
pub async fn get_control_file(
|
pub async fn get_control_file(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<Bytes, PageReconstructError> {
|
) -> Result<Bytes, PageReconstructError> {
|
||||||
self.get(CONTROLFILE_KEY, lsn, ctx).await
|
self.get(CONTROLFILE_KEY, lsn, ctx).await
|
||||||
}
|
}
|
||||||
@@ -484,7 +474,7 @@ impl Timeline {
|
|||||||
pub async fn get_checkpoint(
|
pub async fn get_checkpoint(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<Bytes, PageReconstructError> {
|
) -> Result<Bytes, PageReconstructError> {
|
||||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||||
}
|
}
|
||||||
@@ -497,28 +487,20 @@ impl Timeline {
|
|||||||
pub async fn get_current_logical_size_non_incremental(
|
pub async fn get_current_logical_size_non_incremental(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
cancel: CancellationToken,
|
ctx: &TimelineRequestContext,
|
||||||
ctx: &RequestContext,
|
) -> Result<u64, PageReconstructError> {
|
||||||
) -> Result<u64, CalculateLogicalSizeError> {
|
|
||||||
// Fetch list of database dirs and iterate them
|
// Fetch list of database dirs and iterate them
|
||||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
|
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||||
|
|
||||||
let mut total_size: u64 = 0;
|
let mut total_size: u64 = 0;
|
||||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||||
for rel in self
|
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
|
||||||
.list_rels(*spcnode, *dbnode, lsn, ctx)
|
if ctx.is_cancelled() {
|
||||||
.await
|
return Err(PageReconstructError::Cancelled);
|
||||||
.context("list rels")?
|
|
||||||
{
|
|
||||||
if cancel.is_cancelled() {
|
|
||||||
return Err(CalculateLogicalSizeError::Cancelled);
|
|
||||||
}
|
}
|
||||||
let relsize_key = rel_size_to_key(rel);
|
let relsize_key = rel_size_to_key(rel);
|
||||||
let mut buf = self
|
let mut buf = self.get(relsize_key, lsn, ctx).await?;
|
||||||
.get(relsize_key, lsn, ctx)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("read relation size of {rel:?}"))?;
|
|
||||||
let relsize = buf.get_u32_le();
|
let relsize = buf.get_u32_le();
|
||||||
|
|
||||||
total_size += relsize as u64;
|
total_size += relsize as u64;
|
||||||
@@ -534,7 +516,7 @@ impl Timeline {
|
|||||||
pub async fn collect_keyspace(
|
pub async fn collect_keyspace(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<KeySpace> {
|
) -> anyhow::Result<KeySpace> {
|
||||||
// Iterate through key ranges, greedily packing them into partitions
|
// Iterate through key ranges, greedily packing them into partitions
|
||||||
let mut result = KeySpaceAccum::new();
|
let mut result = KeySpaceAccum::new();
|
||||||
@@ -555,7 +537,8 @@ impl Timeline {
|
|||||||
let mut rels: Vec<RelTag> = self
|
let mut rels: Vec<RelTag> = self
|
||||||
.list_rels(spcnode, dbnode, lsn, ctx)
|
.list_rels(spcnode, dbnode, lsn, ctx)
|
||||||
.await?
|
.await?
|
||||||
.into_iter()
|
.iter()
|
||||||
|
.cloned()
|
||||||
.collect();
|
.collect();
|
||||||
rels.sort_unstable();
|
rels.sort_unstable();
|
||||||
for rel in rels {
|
for rel in rels {
|
||||||
@@ -757,7 +740,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
spcnode: Oid,
|
spcnode: Oid,
|
||||||
dbnode: Oid,
|
dbnode: Oid,
|
||||||
img: Bytes,
|
img: Bytes,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Add it to the directory (if it doesn't exist already)
|
// Add it to the directory (if it doesn't exist already)
|
||||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||||
@@ -790,7 +773,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
xid: TransactionId,
|
xid: TransactionId,
|
||||||
img: Bytes,
|
img: Bytes,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Add it to the directory entry
|
// Add it to the directory entry
|
||||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||||
@@ -821,7 +804,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
spcnode: Oid,
|
spcnode: Oid,
|
||||||
dbnode: Oid,
|
dbnode: Oid,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let req_lsn = self.tline.get_last_record_lsn();
|
let req_lsn = self.tline.get_last_record_lsn();
|
||||||
|
|
||||||
@@ -858,7 +841,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
nblocks: BlockNumber,
|
nblocks: BlockNumber,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||||
// It's possible that this is the first rel for this db in this
|
// It's possible that this is the first rel for this db in this
|
||||||
@@ -907,7 +890,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
nblocks: BlockNumber,
|
nblocks: BlockNumber,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||||
let last_lsn = self.tline.get_last_record_lsn();
|
let last_lsn = self.tline.get_last_record_lsn();
|
||||||
@@ -938,7 +921,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
nblocks: BlockNumber,
|
nblocks: BlockNumber,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||||
|
|
||||||
@@ -960,7 +943,11 @@ impl<'a> DatadirModification<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Drop a relation.
|
/// Drop a relation.
|
||||||
pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
|
pub async fn put_rel_drop(
|
||||||
|
&mut self,
|
||||||
|
rel: RelTag,
|
||||||
|
ctx: &TimelineRequestContext,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||||
|
|
||||||
// Remove it from the directory entry
|
// Remove it from the directory entry
|
||||||
@@ -993,7 +980,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
kind: SlruKind,
|
kind: SlruKind,
|
||||||
segno: u32,
|
segno: u32,
|
||||||
nblocks: BlockNumber,
|
nblocks: BlockNumber,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Add it to the directory entry
|
// Add it to the directory entry
|
||||||
let dir_key = slru_dir_to_key(kind);
|
let dir_key = slru_dir_to_key(kind);
|
||||||
@@ -1037,7 +1024,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
kind: SlruKind,
|
kind: SlruKind,
|
||||||
segno: u32,
|
segno: u32,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Remove it from the directory entry
|
// Remove it from the directory entry
|
||||||
let dir_key = slru_dir_to_key(kind);
|
let dir_key = slru_dir_to_key(kind);
|
||||||
@@ -1068,7 +1055,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
pub async fn drop_twophase_file(
|
pub async fn drop_twophase_file(
|
||||||
&mut self,
|
&mut self,
|
||||||
xid: TransactionId,
|
xid: TransactionId,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Remove it from the directory entry
|
// Remove it from the directory entry
|
||||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||||
@@ -1165,7 +1152,11 @@ impl<'a> DatadirModification<'a> {
|
|||||||
|
|
||||||
// Internal helper functions to batch the modifications
|
// Internal helper functions to batch the modifications
|
||||||
|
|
||||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
async fn get(
|
||||||
|
&self,
|
||||||
|
key: Key,
|
||||||
|
ctx: &TimelineRequestContext,
|
||||||
|
) -> Result<Bytes, PageReconstructError> {
|
||||||
// Have we already updated the same key? Read the pending updated
|
// Have we already updated the same key? Read the pending updated
|
||||||
// version in that case.
|
// version in that case.
|
||||||
//
|
//
|
||||||
@@ -1459,15 +1450,15 @@ fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
|
|||||||
Key {
|
Key {
|
||||||
field1: 0x01,
|
field1: 0x01,
|
||||||
field2,
|
field2,
|
||||||
field3: 1,
|
field3: segno,
|
||||||
field4: segno,
|
field4: 0,
|
||||||
field5: 0,
|
field5: 0,
|
||||||
field6: 0,
|
field6: 0,
|
||||||
}..Key {
|
}..Key {
|
||||||
field1: 0x01,
|
field1: 0x01,
|
||||||
field2,
|
field2,
|
||||||
field3: 1,
|
field3: segno,
|
||||||
field4: segno,
|
field4: 0,
|
||||||
field5: 1,
|
field5: 1,
|
||||||
field6: 0,
|
field6: 0,
|
||||||
}
|
}
|
||||||
@@ -1593,18 +1584,18 @@ fn is_slru_block_key(key: Key) -> bool {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn create_test_timeline(
|
pub fn create_test_timeline(
|
||||||
tenant: &crate::tenant::Tenant,
|
tenant: &std::sync::Arc<crate::tenant::Tenant>,
|
||||||
timeline_id: utils::id::TimelineId,
|
timeline_id: utils::id::TimelineId,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
ctx: &RequestContext,
|
tenant_ctx: &crate::tenant::TenantRequestContext,
|
||||||
) -> anyhow::Result<std::sync::Arc<Timeline>> {
|
) -> anyhow::Result<(std::sync::Arc<Timeline>, TimelineRequestContext)> {
|
||||||
let tline = tenant
|
let (tline, timeline_ctx) =
|
||||||
.create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)?
|
tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version, tenant_ctx)?;
|
||||||
.initialize(ctx)?;
|
let tline = tline.initialize(&timeline_ctx)?;
|
||||||
let mut m = tline.begin_modification(Lsn(8));
|
let mut m = tline.begin_modification(Lsn(8));
|
||||||
m.init_empty()?;
|
m.init_empty()?;
|
||||||
m.commit()?;
|
m.commit()?;
|
||||||
Ok(tline)
|
Ok((tline, timeline_ctx))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::bool_assert_comparison)]
|
#[allow(clippy::bool_assert_comparison)]
|
||||||
|
|||||||
@@ -37,17 +37,6 @@ impl Key {
|
|||||||
| self.field6 as i128
|
| self.field6 as i128
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_i128(x: i128) -> Self {
|
|
||||||
Key {
|
|
||||||
field1: ((x >> 120) & 0xf) as u8,
|
|
||||||
field2: ((x >> 104) & 0xFFFF) as u32,
|
|
||||||
field3: (x >> 72) as u32,
|
|
||||||
field4: (x >> 40) as u32,
|
|
||||||
field5: (x >> 32) as u8,
|
|
||||||
field6: x as u32,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn next(&self) -> Key {
|
pub fn next(&self) -> Key {
|
||||||
self.add(1)
|
self.add(1)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,59 +1,21 @@
|
|||||||
//!
|
//!
|
||||||
//! This module provides centralized handling of tokio tasks in the Page Server.
|
//! This module provides some helpers for spawning tokio tasks in the pageserver.
|
||||||
//!
|
//!
|
||||||
//! We provide a few basic facilities:
|
//! Mostly just a wrapper around tokio::spawn, with some code to handle panics.
|
||||||
//! - A global registry of tasks that lists what kind of tasks they are, and
|
|
||||||
//! which tenant or timeline they are working on
|
|
||||||
//!
|
|
||||||
//! - The ability to request a task to shut down.
|
|
||||||
//!
|
|
||||||
//!
|
|
||||||
//! # How it works?
|
|
||||||
//!
|
|
||||||
//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new
|
|
||||||
//! task is spawned, a PageServerTask entry is added there, and when a
|
|
||||||
//! task dies, it removes itself from the hashmap. If you want to kill a
|
|
||||||
//! task, you can scan the hashmap to find it.
|
|
||||||
//!
|
|
||||||
//! # Task shutdown
|
|
||||||
//!
|
|
||||||
//! To kill a task, we rely on co-operation from the victim. Each task is
|
|
||||||
//! expected to periodically call the `is_shutdown_requested()` function, and
|
|
||||||
//! if it returns true, exit gracefully. In addition to that, when waiting for
|
|
||||||
//! the network or other long-running operation, you can use
|
|
||||||
//! `shutdown_watcher()` function to get a Future that will become ready if
|
|
||||||
//! the current task has been requested to shut down. You can use that with
|
|
||||||
//! Tokio select!().
|
|
||||||
//!
|
|
||||||
//! TODO: This would be a good place to also handle panics in a somewhat sane way.
|
|
||||||
//! Depending on what task panics, we might want to kill the whole server, or
|
|
||||||
//! only a single tenant or timeline.
|
|
||||||
//!
|
//!
|
||||||
|
|
||||||
// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
|
|
||||||
// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
|
|
||||||
#![allow(clippy::declare_interior_mutable_const)]
|
|
||||||
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::fmt;
|
|
||||||
use std::future::Future;
|
use std::future::Future;
|
||||||
use std::panic::AssertUnwindSafe;
|
use std::panic::{resume_unwind, AssertUnwindSafe};
|
||||||
use std::sync::atomic::{AtomicU64, Ordering};
|
|
||||||
use std::sync::{Arc, Mutex};
|
|
||||||
|
|
||||||
use futures::FutureExt;
|
use futures::FutureExt;
|
||||||
use tokio::runtime::Runtime;
|
use tokio::runtime::Runtime;
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
use tokio::task_local;
|
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
|
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info};
|
||||||
|
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
use utils::id::{TenantId, TimelineId};
|
use crate::context::{self, TaskKind};
|
||||||
|
|
||||||
use crate::shutdown_pageserver;
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// There are four runtimes:
|
// There are four runtimes:
|
||||||
@@ -92,10 +54,6 @@ use crate::shutdown_pageserver;
|
|||||||
// runtime. If a GetPage request comes in before the load of a tenant has finished, the
|
// runtime. If a GetPage request comes in before the load of a tenant has finished, the
|
||||||
// GetPage request will wait for the tenant load to finish.
|
// GetPage request will wait for the tenant load to finish.
|
||||||
//
|
//
|
||||||
// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to
|
|
||||||
// protect data structures. Let's keep it that way. Synchronous code is easier to debug
|
|
||||||
// and analyze, and there's a lot of hairy, low-level, performance critical code there.
|
|
||||||
//
|
|
||||||
// It's nice to have different runtimes, so that you can quickly eyeball how much CPU
|
// It's nice to have different runtimes, so that you can quickly eyeball how much CPU
|
||||||
// time each class of operations is taking, with 'top -H' or similar.
|
// time each class of operations is taking, with 'top -H' or similar.
|
||||||
//
|
//
|
||||||
@@ -135,392 +93,81 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
|||||||
.expect("Failed to create background op runtime")
|
.expect("Failed to create background op runtime")
|
||||||
});
|
});
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
|
||||||
pub struct PageserverTaskId(u64);
|
|
||||||
|
|
||||||
impl fmt::Display for PageserverTaskId {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
self.0.fmt(f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Each task that we track is associated with a "task ID". It's just an
|
|
||||||
/// increasing number that we assign. Note that it is different from tokio::task::Id.
|
|
||||||
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
|
|
||||||
|
|
||||||
/// Global registry of tasks
|
|
||||||
static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
|
|
||||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
|
||||||
|
|
||||||
task_local! {
|
|
||||||
// This is a cancellation token which will be cancelled when a task needs to shut down. The
|
|
||||||
// root token is kept in the global registry, so that anyone can send the signal to request
|
|
||||||
// task shutdown.
|
|
||||||
static SHUTDOWN_TOKEN: CancellationToken;
|
|
||||||
|
|
||||||
// Each task holds reference to its own PageServerTask here.
|
|
||||||
static CURRENT_TASK: Arc<PageServerTask>;
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
/// There are many kinds of tasks in the system. Some are associated with a particular
|
|
||||||
/// tenant or timeline, while others are global.
|
|
||||||
///
|
|
||||||
/// Note that we don't try to limit how many task of a certain kind can be running
|
|
||||||
/// at the same time.
|
|
||||||
///
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
|
||||||
pub enum TaskKind {
|
|
||||||
// Pageserver startup, i.e., `main`
|
|
||||||
Startup,
|
|
||||||
|
|
||||||
// libpq listener task. It just accepts connection and spawns a
|
|
||||||
// PageRequestHandler task for each connection.
|
|
||||||
LibpqEndpointListener,
|
|
||||||
|
|
||||||
// HTTP endpoint listener.
|
|
||||||
HttpEndpointListener,
|
|
||||||
|
|
||||||
// Task that handles a single connection. A PageRequestHandler task
|
|
||||||
// starts detached from any particular tenant or timeline, but it can be
|
|
||||||
// associated with one later, after receiving a command from the client.
|
|
||||||
PageRequestHandler,
|
|
||||||
|
|
||||||
/// Manages the WAL receiver connection for one timeline.
|
|
||||||
/// It subscribes to events from storage_broker and decides which safekeeper to connect to.
|
|
||||||
/// Once the decision has been made, it establishes the connection using the `tokio-postgres` library.
|
|
||||||
/// There is at most one connection at any given time.
|
|
||||||
///
|
|
||||||
/// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`.
|
|
||||||
/// The `Client` object is what library users use to make requests & get responses.
|
|
||||||
/// Internally, `Client` hands over requests to the `Connection` object.
|
|
||||||
/// The `Connection` object is responsible for speaking the wire protocol.
|
|
||||||
///
|
|
||||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
|
||||||
/// That abstraction doesn't use `task_mgr`.
|
|
||||||
/// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
|
|
||||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
|
||||||
///
|
|
||||||
/// Once the connection is established, the `TaskHandle` task creates a
|
|
||||||
/// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
|
|
||||||
/// the `Connection` object.
|
|
||||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
|
||||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
|
|
||||||
WalReceiverManager,
|
|
||||||
|
|
||||||
/// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
|
|
||||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
|
||||||
/// See the comment on [`WalReceiverManager`].
|
|
||||||
WalReceiverConnectionHandler,
|
|
||||||
|
|
||||||
/// The task that polls the `tokio-postgres::Connection` object.
|
|
||||||
/// Spawned by task [`WalReceiverConnectionHandler`].
|
|
||||||
/// See the comment on [`WalReceiverManager`].
|
|
||||||
WalReceiverConnectionPoller,
|
|
||||||
|
|
||||||
// Garbage collection worker. One per tenant
|
|
||||||
GarbageCollector,
|
|
||||||
|
|
||||||
// Compaction. One per tenant.
|
|
||||||
Compaction,
|
|
||||||
|
|
||||||
// Initial logical size calculation
|
|
||||||
InitialLogicalSizeCalculation,
|
|
||||||
|
|
||||||
OndemandLogicalSizeCalculation,
|
|
||||||
|
|
||||||
// Task that flushes frozen in-memory layers to disk
|
|
||||||
LayerFlushTask,
|
|
||||||
|
|
||||||
// Task that uploads a file to remote storage
|
|
||||||
RemoteUploadTask,
|
|
||||||
|
|
||||||
// Task that downloads a file from remote storage
|
|
||||||
RemoteDownloadTask,
|
|
||||||
|
|
||||||
// task that handles the initial downloading of all tenants
|
|
||||||
InitialLoad,
|
|
||||||
|
|
||||||
// task that handles attaching a tenant
|
|
||||||
Attach,
|
|
||||||
|
|
||||||
// task that handhes metrics collection
|
|
||||||
MetricsCollection,
|
|
||||||
|
|
||||||
// task that drives downloading layers
|
|
||||||
DownloadAllRemoteLayers,
|
|
||||||
// Task that calculates synthetis size for all active tenants
|
|
||||||
CalculateSyntheticSize,
|
|
||||||
|
|
||||||
// A request that comes in via the pageserver HTTP API.
|
|
||||||
MgmtRequest,
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
UnitTest,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
struct MutableTaskState {
|
|
||||||
/// Tenant and timeline that this task is associated with.
|
|
||||||
tenant_id: Option<TenantId>,
|
|
||||||
timeline_id: Option<TimelineId>,
|
|
||||||
|
|
||||||
/// Handle for waiting for the task to exit. It can be None, if the
|
|
||||||
/// the task has already exited.
|
|
||||||
join_handle: Option<JoinHandle<()>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct PageServerTask {
|
|
||||||
#[allow(dead_code)] // unused currently
|
|
||||||
task_id: PageserverTaskId,
|
|
||||||
|
|
||||||
kind: TaskKind,
|
|
||||||
|
|
||||||
name: String,
|
|
||||||
|
|
||||||
// To request task shutdown, just cancel this token.
|
|
||||||
cancel: CancellationToken,
|
|
||||||
|
|
||||||
mutable: Mutex<MutableTaskState>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Launch a new task
|
/// Launch a new task
|
||||||
/// Note: if shutdown_process_on_error is set to true failure
|
///
|
||||||
/// of the task will lead to shutdown of entire process
|
/// This is a wrapper around tokio::spawn. One difference is that the Future
|
||||||
|
/// is marked to return nothing to avoid silently swallowing errors. This
|
||||||
|
/// forces the future to handle errors by itself. If you need the return
|
||||||
|
/// value, you could create another function that passes it through, but we
|
||||||
|
/// don't have a need for that currently.
|
||||||
|
///
|
||||||
|
/// If shutdown_process_on_panic is set to true, panic of the task will lead
|
||||||
|
/// to shutdown of entire process. Otherwise we log the panic and continue.
|
||||||
pub fn spawn<F>(
|
pub fn spawn<F>(
|
||||||
runtime: &tokio::runtime::Handle,
|
runtime: &tokio::runtime::Handle,
|
||||||
kind: TaskKind,
|
|
||||||
tenant_id: Option<TenantId>,
|
|
||||||
timeline_id: Option<TimelineId>,
|
|
||||||
name: &str,
|
name: &str,
|
||||||
shutdown_process_on_error: bool,
|
shutdown_process_on_panic: bool,
|
||||||
future: F,
|
future: F,
|
||||||
) -> PageserverTaskId
|
) -> JoinHandle<F::Output>
|
||||||
where
|
where
|
||||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
F: Future<Output = ()> + Send + 'static,
|
||||||
{
|
{
|
||||||
let cancel = CancellationToken::new();
|
|
||||||
let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
|
|
||||||
let task = Arc::new(PageServerTask {
|
|
||||||
task_id: PageserverTaskId(task_id),
|
|
||||||
kind,
|
|
||||||
name: name.to_string(),
|
|
||||||
cancel: cancel.clone(),
|
|
||||||
mutable: Mutex::new(MutableTaskState {
|
|
||||||
tenant_id,
|
|
||||||
timeline_id,
|
|
||||||
join_handle: None,
|
|
||||||
}),
|
|
||||||
});
|
|
||||||
|
|
||||||
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
|
||||||
|
|
||||||
let mut task_mut = task.mutable.lock().unwrap();
|
|
||||||
|
|
||||||
let task_name = name.to_string();
|
let task_name = name.to_string();
|
||||||
let task_cloned = Arc::clone(&task);
|
runtime.spawn(task_wrapper(task_name, shutdown_process_on_panic, future))
|
||||||
let join_handle = runtime.spawn(task_wrapper(
|
|
||||||
task_name,
|
|
||||||
task_id,
|
|
||||||
task_cloned,
|
|
||||||
cancel,
|
|
||||||
shutdown_process_on_error,
|
|
||||||
future,
|
|
||||||
));
|
|
||||||
task_mut.join_handle = Some(join_handle);
|
|
||||||
drop(task_mut);
|
|
||||||
|
|
||||||
// The task is now running. Nothing more to do here
|
|
||||||
PageserverTaskId(task_id)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This wrapper function runs in a newly-spawned task. It initializes the
|
/// This wrapper function runs in a newly-spawned task. To handle panics.
|
||||||
/// task-local variables and calls the payload function.
|
async fn task_wrapper<F, R>(task_name: String, shutdown_process_on_panic: bool, future: F) -> R
|
||||||
async fn task_wrapper<F>(
|
where
|
||||||
task_name: String,
|
F: Future<Output = R> + Send + 'static,
|
||||||
task_id: u64,
|
|
||||||
task: Arc<PageServerTask>,
|
|
||||||
shutdown_token: CancellationToken,
|
|
||||||
shutdown_process_on_error: bool,
|
|
||||||
future: F,
|
|
||||||
) where
|
|
||||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
|
||||||
{
|
{
|
||||||
debug!("Starting task '{}'", task_name);
|
debug!("Starting task '{}'", task_name);
|
||||||
|
|
||||||
let result = SHUTDOWN_TOKEN
|
// We use AssertUnwindSafe here so that the payload function
|
||||||
.scope(
|
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||||
shutdown_token,
|
// unwinding that would expose us to unwind-unsafe behavior.
|
||||||
CURRENT_TASK.scope(task, {
|
let result = AssertUnwindSafe(future).catch_unwind().await;
|
||||||
// We use AssertUnwindSafe here so that the payload function
|
|
||||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
|
||||||
// unwinding that would expose us to unwind-unsafe behavior.
|
|
||||||
AssertUnwindSafe(future).catch_unwind()
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
task_finish(result, task_name, task_id, shutdown_process_on_error).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn task_finish(
|
// Handle panics
|
||||||
result: std::result::Result<
|
match result {
|
||||||
anyhow::Result<()>,
|
Ok(result) => {
|
||||||
std::boxed::Box<dyn std::any::Any + std::marker::Send>,
|
debug!("Task '{}' exited normally", task_name);
|
||||||
>,
|
result
|
||||||
task_name: String,
|
}
|
||||||
task_id: u64,
|
Err(err) => {
|
||||||
shutdown_process_on_error: bool,
|
if shutdown_process_on_panic {
|
||||||
) {
|
error!("Shutting down: task '{}' panicked: {:?}", task_name, err);
|
||||||
// Remove our entry from the global hashmap.
|
shutdown_pageserver(1).await;
|
||||||
let task = TASKS
|
unreachable!();
|
||||||
.lock()
|
} else {
|
||||||
.unwrap()
|
error!("Task '{}' panicked: {:?}", task_name, err);
|
||||||
.remove(&task_id)
|
resume_unwind(err);
|
||||||
.expect("no task in registry");
|
|
||||||
|
|
||||||
let mut shutdown_process = false;
|
|
||||||
{
|
|
||||||
let task_mut = task.mutable.lock().unwrap();
|
|
||||||
|
|
||||||
match result {
|
|
||||||
Ok(Ok(())) => {
|
|
||||||
debug!("Task '{}' exited normally", task_name);
|
|
||||||
}
|
|
||||||
Ok(Err(err)) => {
|
|
||||||
if shutdown_process_on_error {
|
|
||||||
error!(
|
|
||||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
|
||||||
);
|
|
||||||
shutdown_process = true;
|
|
||||||
} else {
|
|
||||||
error!(
|
|
||||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(err) => {
|
|
||||||
if shutdown_process_on_error {
|
|
||||||
error!(
|
|
||||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
|
||||||
);
|
|
||||||
shutdown_process = true;
|
|
||||||
} else {
|
|
||||||
error!(
|
|
||||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if shutdown_process {
|
|
||||||
shutdown_pageserver(1).await;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// expected to be called from the task of the given id.
|
|
||||||
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
|
|
||||||
CURRENT_TASK.with(|ct| {
|
|
||||||
let mut task_mut = ct.mutable.lock().unwrap();
|
|
||||||
task_mut.tenant_id = tenant_id;
|
|
||||||
task_mut.timeline_id = timeline_id;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Is there a task running that matches the criteria
|
|
||||||
|
|
||||||
/// Signal and wait for tasks to shut down.
|
|
||||||
///
|
///
|
||||||
|
/// Perform pageserver shutdown. This is called on receiving a signal,
|
||||||
|
/// or if one of the tasks marked as 'shutdown_process_on_error' dies.
|
||||||
///
|
///
|
||||||
/// The arguments are used to select the tasks to kill. Any None arguments are
|
/// This never returns.
|
||||||
/// ignored. For example, to shut down all WalReceiver tasks:
|
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||||
///
|
// Shut down the libpq endpoint task. This prevents new connections from
|
||||||
/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None)
|
// being accepted.
|
||||||
///
|
context::shutdown_tasks(TaskKind::LibpqEndpointListener).await;
|
||||||
/// Or to shut down all tasks for given timeline:
|
|
||||||
///
|
|
||||||
/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
|
|
||||||
///
|
|
||||||
pub async fn shutdown_tasks(
|
|
||||||
kind: Option<TaskKind>,
|
|
||||||
tenant_id: Option<TenantId>,
|
|
||||||
timeline_id: Option<TimelineId>,
|
|
||||||
) {
|
|
||||||
let mut victim_tasks = Vec::new();
|
|
||||||
|
|
||||||
{
|
// Shut down all tenants gracefully
|
||||||
let tasks = TASKS.lock().unwrap();
|
crate::tenant::mgr::shutdown_all_tenants().await;
|
||||||
for task in tasks.values() {
|
|
||||||
let task_mut = task.mutable.lock().unwrap();
|
|
||||||
if (kind.is_none() || Some(task.kind) == kind)
|
|
||||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
|
||||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
|
||||||
{
|
|
||||||
task.cancel.cancel();
|
|
||||||
victim_tasks.push(Arc::clone(task));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for task in victim_tasks {
|
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||||
let join_handle = {
|
// status while it's shutting down.
|
||||||
let mut task_mut = task.mutable.lock().unwrap();
|
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||||
info!("waiting for {} to shut down", task.name);
|
context::shutdown_tasks(TaskKind::HttpEndpointListener).await;
|
||||||
let join_handle = task_mut.join_handle.take();
|
|
||||||
drop(task_mut);
|
// There should be nothing left, but let's be sure
|
||||||
join_handle
|
context::shutdown_all_tasks().await;
|
||||||
};
|
|
||||||
if let Some(join_handle) = join_handle {
|
info!("Shut down successfully completed");
|
||||||
let _ = join_handle.await;
|
std::process::exit(exit_code);
|
||||||
} else {
|
|
||||||
// Possibly one of:
|
|
||||||
// * The task had not even fully started yet.
|
|
||||||
// * It was shut down concurrently and already exited
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn current_task_kind() -> Option<TaskKind> {
|
|
||||||
CURRENT_TASK.try_with(|ct| ct.kind).ok()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn current_task_id() -> Option<PageserverTaskId> {
|
|
||||||
CURRENT_TASK.try_with(|ct| ct.task_id).ok()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A Future that can be used to check if the current task has been requested to
|
|
||||||
/// shut down.
|
|
||||||
pub async fn shutdown_watcher() {
|
|
||||||
let token = SHUTDOWN_TOKEN
|
|
||||||
.try_with(|t| t.clone())
|
|
||||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
|
||||||
|
|
||||||
token.cancelled().await;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Clone the current task's cancellation token, which can be moved across tasks.
|
|
||||||
///
|
|
||||||
/// When the task which is currently executing is shutdown, the cancellation token will be
|
|
||||||
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
|
|
||||||
/// `tokio::task::JoinSet::spawn`.
|
|
||||||
pub fn shutdown_token() -> CancellationToken {
|
|
||||||
SHUTDOWN_TOKEN
|
|
||||||
.try_with(|t| t.clone())
|
|
||||||
.expect("shutdown_token() called in an unexpected task or thread")
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Has the current task been requested to shut down?
|
|
||||||
pub fn is_shutdown_requested() -> bool {
|
|
||||||
if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
|
|
||||||
cancel.is_cancelled()
|
|
||||||
} else {
|
|
||||||
if !cfg!(test) {
|
|
||||||
warn!("is_shutdown_requested() called in an unexpected task or thread");
|
|
||||||
}
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -28,12 +28,7 @@ pub mod defaults {
|
|||||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||||
|
|
||||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||||
|
pub const DEFAULT_GC_PERIOD: &str = "100 s";
|
||||||
// Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
|
|
||||||
// If there's a need to decrease this value, first make sure that GC
|
|
||||||
// doesn't hold a layer map write lock for non-trivial operations.
|
|
||||||
// Relevant: https://github.com/neondatabase/neon/issues/3394
|
|
||||||
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
|
|
||||||
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
||||||
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
||||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
|
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,583 +0,0 @@
|
|||||||
use std::collections::BTreeMap;
|
|
||||||
use std::ops::Range;
|
|
||||||
|
|
||||||
use tracing::info;
|
|
||||||
|
|
||||||
use super::layer_coverage::LayerCoverageTuple;
|
|
||||||
|
|
||||||
/// Identity and sort key for the layers indexed by this module.
///
/// The ordering is defined so that layers sort by `lsn.start` first.
///
/// Key range, LSN range and the image/delta flag are enough to uniquely
/// identify a layer: a layer must contain all contents within its range, so
/// two deltas (or two images) with the same range have identical content.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct LayerKey {
    // TODO i128 and u64 were used because they were easy for prototyping,
    //      testing, and benchmarking. Using the Lsn and Key types would be
    //      preferable if that comes without overhead.
    pub key: Range<i128>,
    pub lsn: Range<u64>,
    pub is_image: bool,
}

impl LayerKey {
    /// The fields in comparison priority order: lsn.start, lsn.end,
    /// key.start, key.end, is_image.
    fn sort_fields(&self) -> (u64, u64, i128, i128, bool) {
        (
            self.lsn.start,
            self.lsn.end,
            self.key.start,
            self.key.end,
            self.is_image,
        )
    }
}

impl PartialOrd for LayerKey {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // The order is total, so simply defer to `Ord`.
        Some(Ord::cmp(self, other))
    }
}

impl Ord for LayerKey {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // NOTE comparing by lsn.start first is what really matters here.
        // Tuples compare lexicographically, which yields exactly that order.
        self.sort_fields().cmp(&other.sort_fields())
    }
}
|
|
||||||
|
|
||||||
/// Efficiently queryable layer coverage for each LSN.
|
|
||||||
///
|
|
||||||
/// Allows answering layer map queries very efficiently,
|
|
||||||
/// but doesn't allow retroactive insertion, which is
|
|
||||||
/// sometimes necessary. See BufferedHistoricLayerCoverage.
|
|
||||||
pub struct HistoricLayerCoverage<Value> {
|
|
||||||
/// The latest state
|
|
||||||
head: LayerCoverageTuple<Value>,
|
|
||||||
|
|
||||||
/// All previous states
|
|
||||||
historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Clone> Default for HistoricLayerCoverage<T> {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::new()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Value: Clone> HistoricLayerCoverage<Value> {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
head: LayerCoverageTuple::default(),
|
|
||||||
historic: BTreeMap::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Add a layer
|
|
||||||
///
|
|
||||||
/// Panics if new layer has older lsn.start than an existing layer.
|
|
||||||
/// See BufferedHistoricLayerCoverage for a more general insertion method.
|
|
||||||
pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
|
|
||||||
// It's only a persistent map, not a retroactive one
|
|
||||||
if let Some(last_entry) = self.historic.iter().next_back() {
|
|
||||||
let last_lsn = last_entry.0;
|
|
||||||
if layer_key.lsn.start < *last_lsn {
|
|
||||||
panic!("unexpected retroactive insert");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Insert into data structure
|
|
||||||
if layer_key.is_image {
|
|
||||||
self.head
|
|
||||||
.image_coverage
|
|
||||||
.insert(layer_key.key, layer_key.lsn.clone(), value);
|
|
||||||
} else {
|
|
||||||
self.head
|
|
||||||
.delta_coverage
|
|
||||||
.insert(layer_key.key, layer_key.lsn.clone(), value);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remember history. Clone is O(1)
|
|
||||||
self.historic.insert(layer_key.lsn.start, self.head.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Query at a particular LSN, inclusive
|
|
||||||
pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple<Value>> {
|
|
||||||
match self.historic.range(..=lsn).next_back() {
|
|
||||||
Some((_, v)) => Some(v),
|
|
||||||
None => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Remove all entries after a certain LSN (inclusive)
|
|
||||||
pub fn trim(&mut self, begin: &u64) {
|
|
||||||
self.historic.split_off(begin);
|
|
||||||
self.head = self
|
|
||||||
.historic
|
|
||||||
.iter()
|
|
||||||
.rev()
|
|
||||||
.next()
|
|
||||||
.map(|(_, v)| v.clone())
|
|
||||||
.unwrap_or_default();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The most basic test, demonstrating the intended usage.
/// Every layer in this test has height 1.
#[test]
fn test_persistent_simple() {
    let mut map = HistoricLayerCoverage::<String>::new();

    let layers: [(Range<i128>, Range<u64>, &str); 3] = [
        (0..5, 100..101, "Layer 1"),
        (3..9, 110..111, "Layer 2"),
        (5..6, 120..121, "Layer 3"),
    ];
    for (key, lsn, name) in layers {
        let layer_key = LayerKey {
            key,
            lsn,
            is_image: true,
        };
        map.insert(layer_key, name.to_string());
    }

    // (version lsn, queried key, expected layer)
    let checks: [(u64, i128, Option<&str>); 8] = [
        // After Layer 1 insertion
        (105, 1, Some("Layer 1")),
        (105, 4, Some("Layer 1")),
        // After Layer 2 insertion
        (115, 4, Some("Layer 2")),
        (115, 8, Some("Layer 2")),
        (115, 11, None),
        // After Layer 3 insertion
        (125, 4, Some("Layer 2")),
        (125, 5, Some("Layer 3")),
        (125, 7, Some("Layer 2")),
    ];
    for (lsn, key, expected) in checks {
        let version = map.get_version(lsn).unwrap();
        assert_eq!(version.image_coverage.query(key), expected.map(String::from));
    }
}
|
|
||||||
|
|
||||||
/// Simple off-by-one edge cases, for both LSNs and keys
#[test]
fn test_off_by_one() {
    let mut map = HistoricLayerCoverage::<String>::new();
    let only_layer = LayerKey {
        key: 3..5,
        lsn: 100..110,
        is_image: true,
    };
    map.insert(only_layer, String::from("Layer 1"));

    let layer_1 = Some(String::from("Layer 1"));

    // Check different LSNs
    assert!(map.get_version(99).is_none());
    for lsn in [100u64, 110] {
        let version = map.get_version(lsn).unwrap();
        assert_eq!(version.image_coverage.query(4), layer_1);
    }

    // Check different keys
    let version = map.get_version(105).unwrap();
    let per_key: [(i128, Option<String>); 4] = [
        (2, None),
        (3, layer_1.clone()),
        (4, layer_1.clone()),
        (5, None),
    ];
    for (key, expected) in per_key {
        assert_eq!(version.image_coverage.query(key), expected);
    }
}
|
|
||||||
|
|
||||||
/// Edge cases where layers begin or end on the same key
#[test]
fn test_key_collision() {
    let mut map = HistoricLayerCoverage::<String>::new();

    let layers: [(Range<i128>, Range<u64>, &str); 3] = [
        (3..5, 100..110, "Layer 10"),
        (5..8, 100..110, "Layer 11"),
        (3..4, 200..210, "Layer 20"),
    ];
    for (key, lsn, name) in layers {
        let layer_key = LayerKey {
            key,
            lsn,
            is_image: true,
        };
        map.insert(layer_key, name.to_string());
    }

    // (version lsn, queried key, expected layer)
    let checks: [(u64, i128, Option<&str>); 10] = [
        // Check after layer 11
        (105, 2, None),
        (105, 3, Some("Layer 10")),
        (105, 5, Some("Layer 11")),
        (105, 7, Some("Layer 11")),
        (105, 8, None),
        // Check after layer 20
        (205, 2, None),
        (205, 3, Some("Layer 20")),
        (205, 5, Some("Layer 11")),
        (205, 7, Some("Layer 11")),
        (205, 8, None),
    ];
    for (lsn, key, expected) in checks {
        let version = map.get_version(lsn).unwrap();
        assert_eq!(version.image_coverage.query(key), expected.map(String::from));
    }
}
|
|
||||||
|
|
||||||
/// Rectangles with nontrivial height that possibly overlap
#[test]
fn test_persistent_overlapping() {
    let mut map = HistoricLayerCoverage::<String>::new();

    let layers: [(Range<i128>, Range<u64>, &str); 6] = [
        // 3 key-disjoint layers with varying LSN ranges
        (1..2, 100..200, "Layer 1"),
        (4..5, 110..200, "Layer 2"),
        (7..8, 120..300, "Layer 3"),
        // Wide and short layer
        (0..9, 130..199, "Layer 4"),
        // Wide layer taller than some
        (0..9, 140..201, "Layer 5"),
        // Wide layer taller than all
        (0..9, 150..301, "Layer 6"),
    ];
    for (key, lsn, name) in layers {
        let layer_key = LayerKey {
            key,
            lsn,
            is_image: true,
        };
        map.insert(layer_key, name.to_string());
    }

    // The same keys are probed at every version; only the answers differ.
    let probed_keys: [i128; 7] = [0, 1, 2, 4, 5, 7, 8];
    let expectations: [(u64, [&str; 7]); 3] = [
        // After layer 4 insertion
        (
            135,
            [
                "Layer 4", "Layer 1", "Layer 4", "Layer 2", "Layer 4", "Layer 3", "Layer 4",
            ],
        ),
        // After layer 5 insertion
        (
            145,
            [
                "Layer 5", "Layer 5", "Layer 5", "Layer 5", "Layer 5", "Layer 3", "Layer 5",
            ],
        ),
        // After layer 6 insertion
        (
            155,
            [
                "Layer 6", "Layer 6", "Layer 6", "Layer 6", "Layer 6", "Layer 6", "Layer 6",
            ],
        ),
    ];
    for (lsn, expected_names) in expectations {
        let version = map.get_version(lsn).unwrap();
        for (key, name) in probed_keys.iter().zip(expected_names.iter()) {
            assert_eq!(version.image_coverage.query(*key), Some(name.to_string()));
        }
    }
}
|
|
||||||
|
|
||||||
/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack
|
|
||||||
/// of support for retroactive insertion by rebuilding the map since the
|
|
||||||
/// change.
|
|
||||||
///
|
|
||||||
/// Why is this needed? We most often insert new layers with newer LSNs,
|
|
||||||
/// but during compaction we create layers with non-latest LSN, and during
|
|
||||||
/// GC we delete historic layers.
|
|
||||||
///
|
|
||||||
/// Even though rebuilding is an expensive (N log N) solution to the problem,
|
|
||||||
/// it's not critical since we do something equally expensive just to decide
|
|
||||||
/// whether or not to create new image layers.
|
|
||||||
/// TODO It's not expensive but it's not great to hold a layer map write lock
|
|
||||||
/// for that long.
|
|
||||||
///
|
|
||||||
/// If this becomes an actual bottleneck, one solution would be to build a
|
|
||||||
/// segment tree that holds PersistentLayerMaps. Though this would mean that
|
|
||||||
/// we take an additional log(N) performance hit for queries, which will probably
|
|
||||||
/// still be more critical.
|
|
||||||
///
|
|
||||||
/// See this for more on persistent and retroactive techniques:
|
|
||||||
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
|
||||||
pub struct BufferedHistoricLayerCoverage<Value> {
|
|
||||||
/// A persistent layer map that we rebuild when we need to retroactively update
|
|
||||||
historic_coverage: HistoricLayerCoverage<Value>,
|
|
||||||
|
|
||||||
/// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds.
|
|
||||||
buffer: BTreeMap<LayerKey, Option<Value>>,
|
|
||||||
|
|
||||||
/// All current layers. This is not used for search. Only to make rebuilds easier.
|
|
||||||
layers: BTreeMap<LayerKey, Value>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: std::fmt::Debug> std::fmt::Debug for BufferedHistoricLayerCoverage<T> {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
f.debug_struct("RetroactiveLayerMap")
|
|
||||||
.field("buffer", &self.buffer)
|
|
||||||
.field("layers", &self.layers)
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Clone> Default for BufferedHistoricLayerCoverage<T> {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::new()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
historic_coverage: HistoricLayerCoverage::<Value>::new(),
|
|
||||||
buffer: BTreeMap::new(),
|
|
||||||
layers: BTreeMap::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
|
|
||||||
self.buffer.insert(layer_key, Some(value));
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn remove(&mut self, layer_key: LayerKey) {
|
|
||||||
self.buffer.insert(layer_key, None);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn rebuild(&mut self) {
|
|
||||||
// Find the first LSN that needs to be rebuilt
|
|
||||||
let rebuild_since: u64 = match self.buffer.iter().next() {
|
|
||||||
Some((LayerKey { lsn, .. }, _)) => lsn.start,
|
|
||||||
None => return, // No need to rebuild if buffer is empty
|
|
||||||
};
|
|
||||||
|
|
||||||
// Apply buffered updates to self.layers
|
|
||||||
let num_updates = self.buffer.len();
|
|
||||||
self.buffer.retain(|layer_key, layer| {
|
|
||||||
match layer {
|
|
||||||
Some(l) => {
|
|
||||||
self.layers.insert(layer_key.clone(), l.clone());
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
self.layers.remove(layer_key);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
false
|
|
||||||
});
|
|
||||||
|
|
||||||
// Rebuild
|
|
||||||
let mut num_inserted = 0;
|
|
||||||
self.historic_coverage.trim(&rebuild_since);
|
|
||||||
for (layer_key, layer) in self.layers.range(
|
|
||||||
LayerKey {
|
|
||||||
lsn: rebuild_since..0,
|
|
||||||
key: 0..0,
|
|
||||||
is_image: false,
|
|
||||||
}..,
|
|
||||||
) {
|
|
||||||
self.historic_coverage
|
|
||||||
.insert(layer_key.clone(), layer.clone());
|
|
||||||
num_inserted += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO maybe only warn if ratio is at least 10
|
|
||||||
info!(
|
|
||||||
"Rebuilt layer map. Did {} insertions to process a batch of {} updates.",
|
|
||||||
num_inserted, num_updates,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Iterate all the layers
|
|
||||||
pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
|
|
||||||
// NOTE we can actually perform this without rebuilding,
|
|
||||||
// but it's not necessary for now.
|
|
||||||
if !self.buffer.is_empty() {
|
|
||||||
panic!("rebuild pls")
|
|
||||||
}
|
|
||||||
|
|
||||||
self.layers.values().cloned()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return a reference to a queryable map, assuming all updates
|
|
||||||
/// have already been processed using self.rebuild()
|
|
||||||
pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage<Value>> {
|
|
||||||
// NOTE we error here instead of implicitly rebuilding because
|
|
||||||
// rebuilding is somewhat expensive.
|
|
||||||
// TODO maybe implicitly rebuild and log/sentry an error?
|
|
||||||
if !self.buffer.is_empty() {
|
|
||||||
anyhow::bail!("rebuild required")
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(&self.historic_coverage)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Regression test: a single delta layer with a very wide key range must be
/// queryable after a rebuild.
#[test]
fn test_retroactive_regression_1() {
    let mut map = BufferedHistoricLayerCoverage::new();

    let wide_delta = LayerKey {
        key: 0..21267647932558653966460912964485513215,
        lsn: 23761336..23761457,
        is_image: false,
    };
    map.insert(wide_delta, String::from("sdfsdfs"));
    map.rebuild();

    let coverage = map.get().unwrap();
    let version = coverage.get_version(23761457).unwrap();
    let found = version.delta_coverage.query(100);
    assert_eq!(found, Some(String::from("sdfsdfs")));
}
|
|
||||||
|
|
||||||
/// Out-of-order insertion and removal through the buffered wrapper.
#[test]
fn test_retroactive_simple() {
    let mut map = BufferedHistoricLayerCoverage::new();

    let layers: [(Range<i128>, Range<u64>, &str); 5] = [
        // Append some images in increasing LSN order
        (0..5, 100..101, "Image 1"),
        (3..9, 110..111, "Image 2"),
        (4..6, 120..121, "Image 3"),
        (8..9, 120..121, "Image 4"),
        // Add a layer out of order. Despite its name it is registered with
        // is_image: true, like all the others.
        (2..5, 105..106, "Delta 1"),
    ];
    for (key, lsn, name) in layers {
        let layer_key = LayerKey {
            key,
            lsn,
            is_image: true,
        };
        map.insert(layer_key, name.to_string());
    }

    // Rebuild so we can start querying
    map.rebuild();

    // Query key 4
    assert!(map.get().unwrap().get_version(90).is_none());
    let key_4_history: [(u64, &str); 4] = [
        (102, "Image 1"),
        (107, "Delta 1"),
        (115, "Image 2"),
        (125, "Image 3"),
    ];
    for (lsn, name) in key_4_history {
        let version = map.get().unwrap().get_version(lsn).unwrap();
        assert_eq!(version.image_coverage.query(4), Some(name.to_string()));
    }

    // Remove Image 3
    let image_3 = LayerKey {
        key: 4..6,
        lsn: 120..121,
        is_image: true,
    };
    map.remove(image_3);
    map.rebuild();

    // Check deletion worked
    let version = map.get().unwrap().get_version(125).unwrap();
    let after_removal: [(i128, &str); 2] = [(4, "Image 2"), (8, "Image 4")];
    for (key, name) in after_removal {
        assert_eq!(version.image_coverage.query(key), Some(name.to_string()));
    }
}
|
|
||||||
@@ -1,154 +0,0 @@
|
|||||||
use std::ops::Range;
|
|
||||||
|
|
||||||
// TODO the `im` crate has 20x more downloads and also has
|
|
||||||
// persistent/immutable BTree. It also runs a bit faster but
|
|
||||||
// results are not the same on some tests.
|
|
||||||
use rpds::RedBlackTreeMapSync;
|
|
||||||
|
|
||||||
/// Data structure that can efficiently:
|
|
||||||
/// - find the latest layer by lsn.end at a given key
|
|
||||||
/// - iterate the latest layers in a key range
|
|
||||||
/// - insert layers in non-decreasing lsn.start order
|
|
||||||
///
|
|
||||||
/// The struct is parameterized over Value for easier
|
|
||||||
/// testing, but in practice it's some sort of layer.
|
|
||||||
pub struct LayerCoverage<Value> {
|
|
||||||
/// For every change in coverage (as we sweep the key space)
|
|
||||||
/// we store (lsn.end, value).
|
|
||||||
///
|
|
||||||
/// We use an immutable/persistent tree so that we can keep historic
|
|
||||||
/// versions of this coverage without cloning the whole thing and
|
|
||||||
/// incurring quadratic memory cost. See HistoricLayerCoverage.
|
|
||||||
///
|
|
||||||
/// We use the Sync version of the map because we want Self to
|
|
||||||
/// be Sync. Using nonsync might be faster, if we can work with
|
|
||||||
/// that.
|
|
||||||
nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Clone> Default for LayerCoverage<T> {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::new()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Value: Clone> LayerCoverage<Value> {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
nodes: RedBlackTreeMapSync::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Helper function to subdivide the key range without changing any values
|
|
||||||
///
|
|
||||||
/// Complexity: O(log N)
|
|
||||||
fn add_node(&mut self, key: i128) {
|
|
||||||
let value = match self.nodes.range(..=key).last() {
|
|
||||||
Some((_, Some(v))) => Some(v.clone()),
|
|
||||||
Some((_, None)) => None,
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
self.nodes.insert_mut(key, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Insert a layer.
|
|
||||||
///
|
|
||||||
/// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation.
|
|
||||||
pub fn insert(&mut self, key: Range<i128>, lsn: Range<u64>, value: Value) {
|
|
||||||
// Add nodes at endpoints
|
|
||||||
//
|
|
||||||
// NOTE The order of lines is important. We add nodes at the start
|
|
||||||
// and end of the key range **before updating any nodes** in order
|
|
||||||
// to pin down the current coverage outside of the relevant key range.
|
|
||||||
// Only the coverage inside the layer's key range should change.
|
|
||||||
self.add_node(key.start);
|
|
||||||
self.add_node(key.end);
|
|
||||||
|
|
||||||
// Raise the height where necessary
|
|
||||||
//
|
|
||||||
// NOTE This loop is worst case O(N), but amortized O(log N) in the special
|
|
||||||
// case when rectangles have no height. In practice I don't think we'll see
|
|
||||||
// the kind of layer intersections needed to trigger O(N) behavior. The worst
|
|
||||||
// case is N/2 horizontal layers overlapped with N/2 vertical layers in a
|
|
||||||
// grid pattern.
|
|
||||||
let mut to_update = Vec::new();
|
|
||||||
let mut to_remove = Vec::new();
|
|
||||||
let mut prev_covered = false;
|
|
||||||
for (k, node) in self.nodes.range(key.clone()) {
|
|
||||||
let needs_cover = match node {
|
|
||||||
None => true,
|
|
||||||
Some((h, _)) => h < &lsn.end,
|
|
||||||
};
|
|
||||||
if needs_cover {
|
|
||||||
match prev_covered {
|
|
||||||
true => to_remove.push(*k),
|
|
||||||
false => to_update.push(*k),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prev_covered = needs_cover;
|
|
||||||
}
|
|
||||||
if !prev_covered {
|
|
||||||
to_remove.push(key.end);
|
|
||||||
}
|
|
||||||
for k in to_update {
|
|
||||||
self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
|
|
||||||
}
|
|
||||||
for k in to_remove {
|
|
||||||
self.nodes.remove_mut(&k);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the latest (by lsn.end) layer at a given key
|
|
||||||
///
|
|
||||||
/// Complexity: O(log N)
|
|
||||||
pub fn query(&self, key: i128) -> Option<Value> {
|
|
||||||
self.nodes
|
|
||||||
.range(..=key)
|
|
||||||
.rev()
|
|
||||||
.next()?
|
|
||||||
.1
|
|
||||||
.as_ref()
|
|
||||||
.map(|(_, v)| v.clone())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Iterate the changes in layer coverage in a given range. You will likely
|
|
||||||
/// want to start with self.query(key.start), and then follow up with self.range
|
|
||||||
///
|
|
||||||
/// Complexity: O(log N + result_size)
|
|
||||||
pub fn range(&self, key: Range<i128>) -> impl '_ + Iterator<Item = (i128, Option<Value>)> {
|
|
||||||
self.nodes
|
|
||||||
.range(key)
|
|
||||||
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// O(1) clone
|
|
||||||
pub fn clone(&self) -> Self {
|
|
||||||
Self {
|
|
||||||
nodes: self.nodes.clone(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Image and delta coverage at a specific LSN.
|
|
||||||
pub struct LayerCoverageTuple<Value> {
|
|
||||||
pub image_coverage: LayerCoverage<Value>,
|
|
||||||
pub delta_coverage: LayerCoverage<Value>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Clone> Default for LayerCoverageTuple<T> {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
image_coverage: LayerCoverage::default(),
|
|
||||||
delta_coverage: LayerCoverage::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Value: Clone> LayerCoverageTuple<Value> {
|
|
||||||
pub fn clone(&self) -> Self {
|
|
||||||
Self {
|
|
||||||
image_coverage: self.image_coverage.clone(),
|
|
||||||
delta_coverage: self.delta_coverage.clone(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -8,6 +8,8 @@ use std::sync::Arc;
|
|||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
use futures::stream::FuturesUnordered;
|
||||||
|
use futures::StreamExt;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use tokio::sync::RwLock;
|
use tokio::sync::RwLock;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
@@ -16,44 +18,16 @@ use remote_storage::GenericRemoteStorage;
|
|||||||
use utils::crashsafe;
|
use utils::crashsafe;
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::RequestContext;
|
||||||
use crate::task_mgr::{self, TaskKind};
|
|
||||||
use crate::tenant::config::TenantConfOpt;
|
use crate::tenant::config::TenantConfOpt;
|
||||||
use crate::tenant::{Tenant, TenantState};
|
use crate::tenant::{Tenant, TenantRequestContext, TenantState};
|
||||||
use crate::IGNORED_TENANT_FILE_NAME;
|
use crate::IGNORED_TENANT_FILE_NAME;
|
||||||
|
|
||||||
use utils::fs_ext::PathExt;
|
use utils::fs_ext::PathExt;
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
/// The tenants known to the pageserver.
|
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||||
enum TenantsMap {
|
|
||||||
/// [`init_tenant_mgr`] is not done yet.
|
|
||||||
Initializing,
|
|
||||||
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
|
|
||||||
/// New tenants can be added using [`tenant_map_insert`].
|
|
||||||
Open(HashMap<TenantId, Arc<Tenant>>),
|
|
||||||
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
|
|
||||||
/// Existing tenants are still accessible, but no new tenants can be created.
|
|
||||||
ShuttingDown(HashMap<TenantId, Arc<Tenant>>),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TenantsMap {
|
|
||||||
fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
|
|
||||||
match self {
|
|
||||||
TenantsMap::Initializing => None,
|
|
||||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
|
|
||||||
match self {
|
|
||||||
TenantsMap::Initializing => None,
|
|
||||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
|
|
||||||
|
|
||||||
/// Initialize repositories with locally available timelines.
|
/// Initialize repositories with locally available timelines.
|
||||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||||
@@ -64,16 +38,13 @@ pub async fn init_tenant_mgr(
|
|||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Scan local filesystem for attached tenants
|
// Scan local filesystem for attached tenants
|
||||||
|
let mut number_of_tenants = 0;
|
||||||
let tenants_dir = conf.tenants_path();
|
let tenants_dir = conf.tenants_path();
|
||||||
|
|
||||||
let mut tenants = HashMap::new();
|
|
||||||
|
|
||||||
let mut dir_entries = fs::read_dir(&tenants_dir)
|
let mut dir_entries = fs::read_dir(&tenants_dir)
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
||||||
|
|
||||||
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
match dir_entries.next_entry().await {
|
match dir_entries.next_entry().await {
|
||||||
Ok(None) => break,
|
Ok(None) => break,
|
||||||
@@ -117,10 +88,10 @@ pub async fn init_tenant_mgr(
|
|||||||
conf,
|
conf,
|
||||||
&tenant_dir_path,
|
&tenant_dir_path,
|
||||||
remote_storage.clone(),
|
remote_storage.clone(),
|
||||||
&ctx,
|
|
||||||
) {
|
) {
|
||||||
Ok(tenant) => {
|
Ok(tenant) => {
|
||||||
tenants.insert(tenant.tenant_id(), tenant);
|
TENANTS.write().await.insert(tenant.tenant_id(), tenant);
|
||||||
|
number_of_tenants += 1;
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
|
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
|
||||||
@@ -139,11 +110,7 @@ pub async fn init_tenant_mgr(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Processed {} local tenants at startup", tenants.len());
|
info!("Processed {number_of_tenants} local tenants at startup");
|
||||||
|
|
||||||
let mut tenants_map = TENANTS.write().await;
|
|
||||||
assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
|
|
||||||
*tenants_map = TenantsMap::Open(tenants);
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,7 +118,6 @@ pub fn schedule_local_tenant_processing(
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_path: &Path,
|
tenant_path: &Path,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> anyhow::Result<Arc<Tenant>> {
|
) -> anyhow::Result<Arc<Tenant>> {
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
tenant_path.is_dir(),
|
tenant_path.is_dir(),
|
||||||
@@ -186,7 +152,7 @@ pub fn schedule_local_tenant_processing(
|
|||||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||||
if let Some(remote_storage) = remote_storage {
|
if let Some(remote_storage) = remote_storage {
|
||||||
Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx)
|
Tenant::spawn_attach(conf, tenant_id, remote_storage)
|
||||||
} else {
|
} else {
|
||||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||||
Tenant::create_broken_tenant(conf, tenant_id)
|
Tenant::create_broken_tenant(conf, tenant_id)
|
||||||
@@ -194,7 +160,7 @@ pub fn schedule_local_tenant_processing(
|
|||||||
} else {
|
} else {
|
||||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||||
Tenant::spawn_load(conf, tenant_id, remote_storage, ctx)
|
Tenant::spawn_load(conf, tenant_id, remote_storage)
|
||||||
};
|
};
|
||||||
Ok(tenant)
|
Ok(tenant)
|
||||||
}
|
}
|
||||||
@@ -202,63 +168,26 @@ pub fn schedule_local_tenant_processing(
|
|||||||
///
|
///
|
||||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||||
///
|
///
|
||||||
/// NB: We leave the tenants in the map, so that they remain accessible through
|
|
||||||
/// the management API until we shut it down. If we removed the shut-down tenants
|
|
||||||
/// from the tenants map, the management API would return 404 for these tenants,
|
|
||||||
/// because TenantsMap::get() now returns `None`.
|
|
||||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
|
||||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
|
||||||
/// We would then be in split-brain once this pageserver restarts.
|
|
||||||
pub async fn shutdown_all_tenants() {
|
pub async fn shutdown_all_tenants() {
|
||||||
// Prevent new tenants from being created.
|
|
||||||
let tenants_to_shut_down = {
|
let tenants_to_shut_down = {
|
||||||
let mut m = TENANTS.write().await;
|
let mut m = TENANTS.write().await;
|
||||||
match &mut *m {
|
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
|
||||||
TenantsMap::Initializing => {
|
for (_, tenant) in m.drain() {
|
||||||
*m = TenantsMap::ShuttingDown(HashMap::default());
|
if tenant.is_active() {
|
||||||
info!("tenants map is empty");
|
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||||
return;
|
tenant.set_stopping();
|
||||||
}
|
tenants_to_shut_down.push(tenant)
|
||||||
TenantsMap::Open(tenants) => {
|
|
||||||
let tenants_clone = tenants.clone();
|
|
||||||
*m = TenantsMap::ShuttingDown(std::mem::take(tenants));
|
|
||||||
tenants_clone
|
|
||||||
}
|
|
||||||
TenantsMap::ShuttingDown(_) => {
|
|
||||||
error!("already shutting down, this function isn't supposed to be called more than once");
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
drop(m);
|
||||||
|
tenants_to_shut_down
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
|
let mut shutdown_futures: FuturesUnordered<_> = FuturesUnordered::new();
|
||||||
for (_, tenant) in tenants_to_shut_down {
|
for tenant in tenants_to_shut_down.iter() {
|
||||||
if tenant.is_active() {
|
shutdown_futures.push(tenant.graceful_shutdown(true));
|
||||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
|
||||||
tenant.set_stopping();
|
|
||||||
tenants_to_freeze_and_flush.push(tenant);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Shut down all existing walreceiver connections and stop accepting the new ones.
|
|
||||||
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
|
|
||||||
|
|
||||||
// Ok, no background tasks running anymore. Flush any remaining data in
|
|
||||||
// memory to disk.
|
|
||||||
//
|
|
||||||
// We assume that any incoming connections that might request pages from
|
|
||||||
// the tenant have already been terminated by the caller, so there
|
|
||||||
// should be no more activity in any of the repositories.
|
|
||||||
//
|
|
||||||
// On error, log it but continue with the shutdown for other tenants.
|
|
||||||
for tenant in tenants_to_freeze_and_flush {
|
|
||||||
let tenant_id = tenant.tenant_id();
|
|
||||||
debug!("shutdown tenant {tenant_id}");
|
|
||||||
|
|
||||||
if let Err(err) = tenant.freeze_and_flush().await {
|
|
||||||
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
while let Some(_result) = shutdown_futures.next().await {}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn create_tenant(
|
pub async fn create_tenant(
|
||||||
@@ -266,53 +195,64 @@ pub async fn create_tenant(
|
|||||||
tenant_conf: TenantConfOpt,
|
tenant_conf: TenantConfOpt,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
ctx: &RequestContext,
|
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
match TENANTS.write().await.entry(tenant_id) {
|
||||||
tenant_map_insert(tenant_id, |vacant_entry| {
|
hash_map::Entry::Occupied(_) => {
|
||||||
// We're holding the tenants lock in write mode while doing local IO.
|
debug!("tenant {tenant_id} already exists");
|
||||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
|
Ok(None)
|
||||||
// and do the work in that state.
|
}
|
||||||
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
|
hash_map::Entry::Vacant(v) => {
|
||||||
let created_tenant =
|
// Hold the write_tenants() lock, since all of this is local IO.
|
||||||
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
|
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
|
||||||
let crated_tenant_id = created_tenant.tenant_id();
|
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
|
||||||
anyhow::ensure!(
|
let created_tenant =
|
||||||
|
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
|
||||||
|
let crated_tenant_id = created_tenant.tenant_id();
|
||||||
|
anyhow::ensure!(
|
||||||
tenant_id == crated_tenant_id,
|
tenant_id == crated_tenant_id,
|
||||||
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
|
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
|
||||||
);
|
);
|
||||||
vacant_entry.insert(Arc::clone(&created_tenant));
|
v.insert(Arc::clone(&created_tenant));
|
||||||
Ok(created_tenant)
|
Ok(Some(created_tenant))
|
||||||
}).await
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn update_tenant_config(
|
pub async fn update_tenant_config(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_conf: TenantConfOpt,
|
tenant_conf: TenantConfOpt,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
info!("configuring tenant {tenant_id}");
|
info!("configuring tenant {tenant_id}");
|
||||||
get_tenant(tenant_id, true)
|
let (tenant, _ctx) = get_active_tenant(tenant_id, ctx).await?;
|
||||||
.await?
|
|
||||||
.update_tenant_config(tenant_conf);
|
tenant.update_tenant_config(tenant_conf);
|
||||||
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||||
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
pub async fn get_active_tenant(
|
||||||
|
tenant_id: TenantId,
|
||||||
|
parent_ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<(Arc<Tenant>, TenantRequestContext)> {
|
||||||
|
let tenant = get_tenant(tenant_id).await?;
|
||||||
|
let tenant_ctx = match tenant.get_context(parent_ctx) {
|
||||||
|
Ok(ctx) => ctx,
|
||||||
|
Err(state) => anyhow::bail!("Tenant {} is not active, state: {:?}", tenant_id, state,),
|
||||||
|
};
|
||||||
|
Ok((tenant, tenant_ctx))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_tenant(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
|
||||||
let m = TENANTS.read().await;
|
let m = TENANTS.read().await;
|
||||||
let tenant = m
|
let tenant = m
|
||||||
.get(&tenant_id)
|
.get(&tenant_id)
|
||||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||||
if active_only && !tenant.is_active() {
|
|
||||||
anyhow::bail!(
|
Ok(Arc::clone(tenant))
|
||||||
"Tenant {tenant_id} is not active. Current state: {:?}",
|
|
||||||
tenant.current_state()
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
Ok(Arc::clone(tenant))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn delete_timeline(
|
pub async fn delete_timeline(
|
||||||
@@ -320,9 +260,9 @@ pub async fn delete_timeline(
|
|||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
match get_tenant(tenant_id, true).await {
|
match get_active_tenant(tenant_id, ctx).await {
|
||||||
Ok(tenant) => {
|
Ok((tenant, ctx)) => {
|
||||||
tenant.delete_timeline(timeline_id, ctx).await?;
|
tenant.delete_timeline(timeline_id, &ctx).await?;
|
||||||
}
|
}
|
||||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||||
}
|
}
|
||||||
@@ -350,9 +290,8 @@ pub async fn load_tenant(
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
ctx: &RequestContext,
|
) -> anyhow::Result<()> {
|
||||||
) -> Result<(), TenantMapInsertError> {
|
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||||
tenant_map_insert(tenant_id, |vacant_entry| {
|
|
||||||
let tenant_path = conf.tenant_path(&tenant_id);
|
let tenant_path = conf.tenant_path(&tenant_id);
|
||||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||||
if tenant_ignore_mark.exists() {
|
if tenant_ignore_mark.exists() {
|
||||||
@@ -360,7 +299,7 @@ pub async fn load_tenant(
|
|||||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx)
|
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage)
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||||
})?;
|
})?;
|
||||||
@@ -389,24 +328,16 @@ pub async fn ignore_tenant(
|
|||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
|
||||||
pub enum TenantMapListError {
|
|
||||||
#[error("tenant map is still initiailizing")]
|
|
||||||
Initializing,
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Get list of tenants, for the mgmt API
|
/// Get list of tenants, for the mgmt API
|
||||||
///
|
///
|
||||||
pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
|
pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||||
let tenants = TENANTS.read().await;
|
TENANTS
|
||||||
let m = match &*tenants {
|
.read()
|
||||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
.await
|
||||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
.iter()
|
||||||
};
|
|
||||||
Ok(m.iter()
|
|
||||||
.map(|(id, tenant)| (*id, tenant.current_state()))
|
.map(|(id, tenant)| (*id, tenant.current_state()))
|
||||||
.collect())
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Execute Attach mgmt API command.
|
/// Execute Attach mgmt API command.
|
||||||
@@ -417,62 +348,34 @@ pub async fn attach_tenant(
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
remote_storage: GenericRemoteStorage,
|
remote_storage: GenericRemoteStorage,
|
||||||
ctx: &RequestContext,
|
) -> anyhow::Result<()> {
|
||||||
) -> Result<(), TenantMapInsertError> {
|
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||||
tenant_map_insert(tenant_id, |vacant_entry| {
|
|
||||||
let tenant_path = conf.tenant_path(&tenant_id);
|
let tenant_path = conf.tenant_path(&tenant_id);
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
!tenant_path.exists(),
|
!tenant_path.exists(),
|
||||||
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
|
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
|
||||||
);
|
);
|
||||||
|
|
||||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx);
|
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||||
vacant_entry.insert(tenant);
|
vacant_entry.insert(tenant);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
async fn run_if_no_tenant_in_memory<F, V>(tenant_id: TenantId, run: F) -> anyhow::Result<V>
|
||||||
pub enum TenantMapInsertError {
|
|
||||||
#[error("tenant map is still initializing")]
|
|
||||||
StillInitializing,
|
|
||||||
#[error("tenant map is shutting down")]
|
|
||||||
ShuttingDown,
|
|
||||||
#[error("tenant {0} already exists, state: {1:?}")]
|
|
||||||
TenantAlreadyExists(TenantId, TenantState),
|
|
||||||
#[error(transparent)]
|
|
||||||
Closure(#[from] anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that
|
|
||||||
/// entry is vacant. The closure is responsible for creating the tenant object and inserting
|
|
||||||
/// it into the tenants map through the vacant entry that it receives as argument.
|
|
||||||
///
|
|
||||||
/// NB: the closure should return quickly because the current implementation of tenants map
|
|
||||||
/// serializes access through an `RwLock`.
|
|
||||||
async fn tenant_map_insert<F, V>(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
insert_fn: F,
|
|
||||||
) -> Result<V, TenantMapInsertError>
|
|
||||||
where
|
where
|
||||||
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
|
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
|
||||||
{
|
{
|
||||||
let mut guard = TENANTS.write().await;
|
match TENANTS.write().await.entry(tenant_id) {
|
||||||
let m = match &mut *guard {
|
hash_map::Entry::Occupied(e) => {
|
||||||
TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing),
|
anyhow::bail!(
|
||||||
TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown),
|
"tenant {tenant_id} already exists, state: {:?}",
|
||||||
TenantsMap::Open(m) => m,
|
e.get().current_state()
|
||||||
};
|
)
|
||||||
match m.entry(tenant_id) {
|
}
|
||||||
hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists(
|
hash_map::Entry::Vacant(v) => run(v),
|
||||||
tenant_id,
|
|
||||||
e.get().current_state(),
|
|
||||||
)),
|
|
||||||
hash_map::Entry::Vacant(v) => match insert_fn(v) {
|
|
||||||
Ok(v) => Ok(v),
|
|
||||||
Err(e) => Err(TenantMapInsertError::Closure(e)),
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -491,27 +394,31 @@ where
|
|||||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
||||||
// tenant-wide cleanup operations may take some time (removing the entire tenant directory), we want to
|
// tenant-wide cleanup operations may take some time (removing the entire tenant directory), we want to
|
||||||
// avoid holding the lock for the entire process.
|
// avoid holding the lock for the entire process.
|
||||||
{
|
let tenant = {
|
||||||
let tenants_accessor = TENANTS.write().await;
|
let tenants_accessor = TENANTS.write().await;
|
||||||
match tenants_accessor.get(&tenant_id) {
|
match tenants_accessor.get(&tenant_id) {
|
||||||
Some(tenant) => match tenant.current_state() {
|
Some(tenant) => match tenant.current_state() {
|
||||||
TenantState::Attaching
|
TenantState::Attaching
|
||||||
| TenantState::Loading
|
| TenantState::Loading
|
||||||
| TenantState::Broken
|
| TenantState::Broken
|
||||||
| TenantState::Active => tenant.set_stopping(),
|
| TenantState::Active => {
|
||||||
|
tenant.set_stopping();
|
||||||
|
Arc::clone(tenant)
|
||||||
|
}
|
||||||
TenantState::Stopping => {
|
TenantState::Stopping => {
|
||||||
anyhow::bail!("Tenant {tenant_id} is stopping already")
|
anyhow::bail!("Tenant {tenant_id} is stopping already")
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
// shutdown all tenant and timeline tasks (gc, compaction, page service)
|
// Shut down all tenant and timeline tasks.
|
||||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
tenant.graceful_shutdown(true).await;
|
||||||
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
|
|
||||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
|
||||||
|
|
||||||
|
// All tasks that operated on the tenant or any of its timelines have now finished,
|
||||||
|
// and they are in Stopped state so that new ones cannot appear anymore. Proceed
|
||||||
|
// with the cleanup.
|
||||||
match tenant_cleanup
|
match tenant_cleanup
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||||
@@ -533,111 +440,3 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
use {
|
|
||||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
|
||||||
utils::http::error::ApiError,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
pub async fn immediate_gc(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
gc_req: TimelineGcRequest,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
|
||||||
let guard = TENANTS.read().await;
|
|
||||||
let tenant = guard
|
|
||||||
.get(&tenant_id)
|
|
||||||
.map(Arc::clone)
|
|
||||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
|
||||||
.map_err(ApiError::NotFound)?;
|
|
||||||
|
|
||||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
|
||||||
// Use tenant's pitr setting
|
|
||||||
let pitr = tenant.get_pitr_interval();
|
|
||||||
|
|
||||||
// Run in task_mgr to avoid race with tenant_detach operation
|
|
||||||
let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
|
||||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
|
||||||
task_mgr::spawn(
|
|
||||||
&tokio::runtime::Handle::current(),
|
|
||||||
TaskKind::GarbageCollector,
|
|
||||||
Some(tenant_id),
|
|
||||||
Some(timeline_id),
|
|
||||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
|
|
||||||
false,
|
|
||||||
async move {
|
|
||||||
fail::fail_point!("immediate_gc_task_pre");
|
|
||||||
let result = tenant
|
|
||||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
|
|
||||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
|
||||||
.await;
|
|
||||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
|
||||||
// better once the types support it.
|
|
||||||
match task_done.send(result) {
|
|
||||||
Ok(_) => (),
|
|
||||||
Err(result) => error!("failed to send gc result: {result:?}"),
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
);
|
|
||||||
|
|
||||||
// only drop the guard after we've spawned the task, so that timeline shutdown will wait for the task
|
|
||||||
drop(guard);
|
|
||||||
|
|
||||||
Ok(wait_task_done)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
pub async fn immediate_compact(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
|
|
||||||
let guard = TENANTS.read().await;
|
|
||||||
|
|
||||||
let tenant = guard
|
|
||||||
.get(&tenant_id)
|
|
||||||
.map(Arc::clone)
|
|
||||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
|
||||||
.map_err(ApiError::NotFound)?;
|
|
||||||
|
|
||||||
let timeline = tenant
|
|
||||||
.get_timeline(timeline_id, true)
|
|
||||||
.map_err(ApiError::NotFound)?;
|
|
||||||
|
|
||||||
// Run in task_mgr to avoid race with tenant_detach operation
|
|
||||||
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
|
|
||||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
|
||||||
task_mgr::spawn(
|
|
||||||
&tokio::runtime::Handle::current(),
|
|
||||||
TaskKind::Compaction,
|
|
||||||
Some(tenant_id),
|
|
||||||
Some(timeline_id),
|
|
||||||
&format!(
|
|
||||||
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
|
|
||||||
),
|
|
||||||
false,
|
|
||||||
async move {
|
|
||||||
let result = timeline
|
|
||||||
.compact(&ctx)
|
|
||||||
.instrument(
|
|
||||||
info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
match task_done.send(result) {
|
|
||||||
Ok(_) => (),
|
|
||||||
Err(result) => error!("failed to send compaction result: {result:?}"),
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
// only drop the guard after we've spawned the task, so that timeline shutdown will wait for the task
|
|
||||||
drop(guard);
|
|
||||||
|
|
||||||
Ok(wait_task_done)
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -135,7 +135,7 @@
|
|||||||
//! - Initiate upload queue with that [`IndexPart`].
|
//! - Initiate upload queue with that [`IndexPart`].
|
||||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||||
//! and remote state as per [`IndexPart`]. This is done in
|
//! and remote state as per [`IndexPart`]. This is done in
|
||||||
//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
//! [`Timeline::setup_timeline`] and [`Timeline::reconcile_with_remote`].
|
||||||
//!
|
//!
|
||||||
//! Note that if we crash during file deletion between the index update
|
//! Note that if we crash during file deletion between the index update
|
||||||
//! that removes the file from the list of files, and deleting the remote file,
|
//! that removes the file from the list of files, and deleting the remote file,
|
||||||
@@ -214,6 +214,7 @@ use anyhow::ensure;
|
|||||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||||
use std::ops::DerefMut;
|
use std::ops::DerefMut;
|
||||||
use tokio::runtime::Runtime;
|
use tokio::runtime::Runtime;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{debug, info, warn};
|
use tracing::{debug, info, warn};
|
||||||
use tracing::{info_span, Instrument};
|
use tracing::{info_span, Instrument};
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
@@ -225,12 +226,12 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
|||||||
use crate::{
|
use crate::{
|
||||||
config::PageServerConf,
|
config::PageServerConf,
|
||||||
task_mgr,
|
task_mgr,
|
||||||
task_mgr::TaskKind,
|
|
||||||
task_mgr::BACKGROUND_RUNTIME,
|
task_mgr::BACKGROUND_RUNTIME,
|
||||||
tenant::metadata::TimelineMetadata,
|
tenant::metadata::TimelineMetadata,
|
||||||
tenant::upload_queue::{
|
tenant::upload_queue::{
|
||||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||||
},
|
},
|
||||||
|
tenant::TimelineRequestContext,
|
||||||
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
|
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -313,25 +314,50 @@ impl RemoteTimelineClient {
|
|||||||
/// Initialize the upload queue for a remote storage that already received
|
/// Initialize the upload queue for a remote storage that already received
|
||||||
/// an index file upload, i.e., it's not empty.
|
/// an index file upload, i.e., it's not empty.
|
||||||
/// The given `index_part` must be the one on the remote.
|
/// The given `index_part` must be the one on the remote.
|
||||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
pub fn init_upload_queue(
|
||||||
|
self: &Arc<Self>,
|
||||||
|
index_part: &IndexPart,
|
||||||
|
upload_ctx: TimelineRequestContext,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let cancellation_token = upload_ctx.cancellation_token().clone();
|
||||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
upload_queue.initialize_with_current_remote_index_part(index_part, upload_ctx)?;
|
||||||
self.update_remote_physical_size_gauge(Some(index_part));
|
self.update_remote_physical_size_gauge(Some(index_part));
|
||||||
|
self.spawn_cancellation_watch(cancellation_token);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initialize the upload queue for the case where the remote storage is empty,
|
/// Initialize the upload queue for the case where the remote storage is empty,
|
||||||
/// i.e., it doesn't have an `IndexPart`.
|
/// i.e., it doesn't have an `IndexPart`.
|
||||||
pub fn init_upload_queue_for_empty_remote(
|
pub fn init_upload_queue_for_empty_remote(
|
||||||
&self,
|
self: &Arc<Self>,
|
||||||
local_metadata: &TimelineMetadata,
|
local_metadata: &TimelineMetadata,
|
||||||
|
upload_ctx: TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
|
let cancellation_token = upload_ctx.cancellation_token().clone();
|
||||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
upload_queue.initialize_empty_remote(local_metadata, upload_ctx)?;
|
||||||
self.update_remote_physical_size_gauge(None);
|
self.update_remote_physical_size_gauge(None);
|
||||||
|
self.spawn_cancellation_watch(cancellation_token);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Spawn a task that calls `stop` on cancellation. It's important that we
|
||||||
|
/// stop the upload queue promptly, because it holds onto the RequestContext,
|
||||||
|
/// which in turn prevents the Timeline from shutting down.
|
||||||
|
fn spawn_cancellation_watch(self: &Arc<Self>, cancellation_token: CancellationToken) {
|
||||||
|
let self_rc = Arc::clone(self);
|
||||||
|
task_mgr::spawn(
|
||||||
|
self.runtime.handle(),
|
||||||
|
"remote upload queue cancellation watch",
|
||||||
|
false,
|
||||||
|
async move {
|
||||||
|
cancellation_token.cancelled().await;
|
||||||
|
self_rc.stop();
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
|
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
|
||||||
match &*self.upload_queue.lock().unwrap() {
|
match &*self.upload_queue.lock().unwrap() {
|
||||||
UploadQueue::Uninitialized => None,
|
UploadQueue::Uninitialized => None,
|
||||||
@@ -625,7 +651,10 @@ impl RemoteTimelineClient {
|
|||||||
///
|
///
|
||||||
/// Wait for all previously scheduled uploads/deletions to complete
|
/// Wait for all previously scheduled uploads/deletions to complete
|
||||||
///
|
///
|
||||||
pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
|
pub async fn wait_completion(
|
||||||
|
self: &Arc<Self>,
|
||||||
|
ctx: &TimelineRequestContext,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
let (sender, mut receiver) = tokio::sync::watch::channel(());
|
let (sender, mut receiver) = tokio::sync::watch::channel(());
|
||||||
let barrier_op = UploadOp::Barrier(sender);
|
let barrier_op = UploadOp::Barrier(sender);
|
||||||
|
|
||||||
@@ -639,9 +668,16 @@ impl RemoteTimelineClient {
|
|||||||
self.launch_queued_tasks(upload_queue);
|
self.launch_queued_tasks(upload_queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
if receiver.changed().await.is_err() {
|
tokio::select! {
|
||||||
anyhow::bail!("wait_completion aborted because upload queue was stopped");
|
result = receiver.changed() => {
|
||||||
}
|
if result.is_err() {
|
||||||
|
anyhow::bail!("wait_completion aborted because upload queue was stopped");
|
||||||
|
}
|
||||||
|
},
|
||||||
|
_ = ctx.cancelled() => {
|
||||||
|
anyhow::bail!("request cancelled while waiting on uploads to finish");
|
||||||
|
},
|
||||||
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -719,16 +755,15 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Spawn task to perform the task
|
// Spawn task to perform the task
|
||||||
let self_rc = Arc::clone(self);
|
let self_rc = Arc::clone(self);
|
||||||
|
|
||||||
|
let cancellation_token = upload_queue.upload_ctx.cancellation_token().clone();
|
||||||
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
self.runtime.handle(),
|
self.runtime.handle(),
|
||||||
TaskKind::RemoteUploadTask,
|
|
||||||
Some(self.tenant_id),
|
|
||||||
Some(self.timeline_id),
|
|
||||||
"remote upload",
|
"remote upload",
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
self_rc.perform_upload_task(task).await;
|
self_rc.perform_upload_task(task, cancellation_token).await;
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)),
|
.instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)),
|
||||||
);
|
);
|
||||||
@@ -748,7 +783,11 @@ impl RemoteTimelineClient {
|
|||||||
/// The task can be shut down, however. That leads to stopping the whole
|
/// The task can be shut down, however. That leads to stopping the whole
|
||||||
/// queue.
|
/// queue.
|
||||||
///
|
///
|
||||||
async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
|
async fn perform_upload_task(
|
||||||
|
self: &Arc<Self>,
|
||||||
|
task: Arc<UploadTask>,
|
||||||
|
cancellation_token: CancellationToken,
|
||||||
|
) {
|
||||||
// Loop to retry until it completes.
|
// Loop to retry until it completes.
|
||||||
loop {
|
loop {
|
||||||
// If we're requested to shut down, close up shop and exit.
|
// If we're requested to shut down, close up shop and exit.
|
||||||
@@ -760,7 +799,7 @@ impl RemoteTimelineClient {
|
|||||||
// the Future, but we're not 100% sure if the remote storage library
|
// the Future, but we're not 100% sure if the remote storage library
|
||||||
// is cancellation safe, so we don't dare to do that. Hopefully, the
|
// is cancellation safe, so we don't dare to do that. Hopefully, the
|
||||||
// upload finishes or times out soon enough.
|
// upload finishes or times out soon enough.
|
||||||
if task_mgr::is_shutdown_requested() {
|
if cancellation_token.is_cancelled() {
|
||||||
info!("upload task cancelled by shutdown request");
|
info!("upload task cancelled by shutdown request");
|
||||||
self.calls_unfinished_metric_end(&task.op);
|
self.calls_unfinished_metric_end(&task.op);
|
||||||
self.stop();
|
self.stop();
|
||||||
@@ -858,7 +897,7 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// sleep until it's time to retry, or we're cancelled
|
// sleep until it's time to retry, or we're cancelled
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = task_mgr::shutdown_watcher() => { },
|
_ = cancellation_token.cancelled() => { },
|
||||||
_ = exponential_backoff(
|
_ = exponential_backoff(
|
||||||
retries,
|
retries,
|
||||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||||
@@ -1010,10 +1049,9 @@ impl RemoteTimelineClient {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::{
|
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||||
DEFAULT_PG_VERSION,
|
use crate::DEFAULT_PG_VERSION;
|
||||||
};
|
|
||||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||||
use std::{collections::HashSet, path::Path};
|
use std::{collections::HashSet, path::Path};
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
@@ -1032,7 +1070,7 @@ mod tests {
|
|||||||
Lsn(0),
|
Lsn(0),
|
||||||
// Any version will do
|
// Any version will do
|
||||||
// but it should be consistent with the one in the tests
|
// but it should be consistent with the one in the tests
|
||||||
crate::DEFAULT_PG_VERSION,
|
DEFAULT_PG_VERSION,
|
||||||
);
|
);
|
||||||
|
|
||||||
// go through serialize + deserialize to fix the header, including checksum
|
// go through serialize + deserialize to fix the header, including checksum
|
||||||
@@ -1076,9 +1114,9 @@ mod tests {
|
|||||||
let _entered = runtime.enter();
|
let _entered = runtime.enter();
|
||||||
|
|
||||||
let harness = TenantHarness::create("upload_scheduling")?;
|
let harness = TenantHarness::create("upload_scheduling")?;
|
||||||
let (tenant, ctx) = runtime.block_on(harness.load());
|
let (tenant, tenant_ctx) = runtime.block_on(harness.load());
|
||||||
let _timeline =
|
let (_timeline, timeline_ctx) =
|
||||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &tenant_ctx)?;
|
||||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||||
|
|
||||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||||
@@ -1132,7 +1170,11 @@ mod tests {
|
|||||||
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
|
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
|
||||||
|
|
||||||
let metadata = dummy_metadata(Lsn(0x10));
|
let metadata = dummy_metadata(Lsn(0x10));
|
||||||
client.init_upload_queue_for_empty_remote(&metadata)?;
|
let upload_ctx = timeline_ctx.register_another(RequestContext::new(
|
||||||
|
TaskKind::RemoteUploadTask,
|
||||||
|
DownloadBehavior::Error,
|
||||||
|
));
|
||||||
|
client.init_upload_queue_for_empty_remote(&metadata, upload_ctx)?;
|
||||||
|
|
||||||
// Create a couple of dummy files, schedule upload for them
|
// Create a couple of dummy files, schedule upload for them
|
||||||
let content_foo = dummy_contents("foo");
|
let content_foo = dummy_contents("foo");
|
||||||
@@ -1172,7 +1214,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Wait for the uploads to finish
|
// Wait for the uploads to finish
|
||||||
runtime.block_on(client.wait_completion())?;
|
runtime.block_on(client.wait_completion(&timeline_ctx))?;
|
||||||
{
|
{
|
||||||
let mut guard = client.upload_queue.lock().unwrap();
|
let mut guard = client.upload_queue.lock().unwrap();
|
||||||
let upload_queue = guard.initialized_mut().unwrap();
|
let upload_queue = guard.initialized_mut().unwrap();
|
||||||
@@ -1209,7 +1251,7 @@ mod tests {
|
|||||||
assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);
|
assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);
|
||||||
|
|
||||||
// Finish them
|
// Finish them
|
||||||
runtime.block_on(client.wait_completion())?;
|
runtime.block_on(client.wait_completion(&timeline_ctx))?;
|
||||||
|
|
||||||
assert_remote_files(&["bar", "baz", "index_part.json"], &remote_timeline_dir);
|
assert_remote_files(&["bar", "baz", "index_part.json"], &remote_timeline_dir);
|
||||||
|
|
||||||
|
|||||||
@@ -3,11 +3,9 @@ use std::collections::{HashMap, HashSet};
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use tokio::sync::oneshot::error::RecvError;
|
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::tenant::{PageReconstructError, TenantRequestContext, TimelineRequestContext};
|
||||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
|
||||||
|
|
||||||
use super::Tenant;
|
use super::Tenant;
|
||||||
use utils::id::TimelineId;
|
use utils::id::TimelineId;
|
||||||
@@ -24,13 +22,7 @@ use tracing::*;
|
|||||||
pub struct ModelInputs {
|
pub struct ModelInputs {
|
||||||
updates: Vec<Update>,
|
updates: Vec<Update>,
|
||||||
retention_period: u64,
|
retention_period: u64,
|
||||||
|
|
||||||
/// Relevant lsns per timeline.
|
|
||||||
///
|
|
||||||
/// This field is not required for deserialization purposes, which is mostly used in tests. The
|
|
||||||
/// LSNs explain the outcome (updates) but are not needed in size calculation.
|
|
||||||
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
|
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
|
||||||
#[serde(default)]
|
|
||||||
timeline_inputs: HashMap<TimelineId, TimelineInputs>,
|
timeline_inputs: HashMap<TimelineId, TimelineInputs>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -39,8 +31,6 @@ pub struct ModelInputs {
|
|||||||
#[serde_with::serde_as]
|
#[serde_with::serde_as]
|
||||||
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
||||||
struct TimelineInputs {
|
struct TimelineInputs {
|
||||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
|
||||||
ancestor_lsn: Lsn,
|
|
||||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
last_record: Lsn,
|
last_record: Lsn,
|
||||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
@@ -53,116 +43,6 @@ struct TimelineInputs {
|
|||||||
next_gc_cutoff: Lsn,
|
next_gc_cutoff: Lsn,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adjust BranchFrom sorting so that we always process ancestor
|
|
||||||
// before descendants. This is needed to correctly calculate size of
|
|
||||||
// descendant timelines.
|
|
||||||
//
|
|
||||||
// Note that we may have multiple BranchFroms at the same LSN, so we
|
|
||||||
// need to sort them in the tree order.
|
|
||||||
//
|
|
||||||
// see updates_sort_with_branches_at_same_lsn test below
|
|
||||||
fn sort_updates_in_tree_order(updates: Vec<Update>) -> anyhow::Result<Vec<Update>> {
|
|
||||||
let mut sorted_updates = Vec::with_capacity(updates.len());
|
|
||||||
let mut known_timelineids = HashSet::new();
|
|
||||||
let mut i = 0;
|
|
||||||
while i < updates.len() {
|
|
||||||
let curr_upd = &updates[i];
|
|
||||||
|
|
||||||
if let Command::BranchFrom(parent_id) = curr_upd.command {
|
|
||||||
let parent_id = match parent_id {
|
|
||||||
Some(parent_id) if known_timelineids.contains(&parent_id) => {
|
|
||||||
// we have already processed ancestor
|
|
||||||
// process this BranchFrom Update normally
|
|
||||||
known_timelineids.insert(curr_upd.timeline_id);
|
|
||||||
sorted_updates.push(*curr_upd);
|
|
||||||
i += 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
known_timelineids.insert(curr_upd.timeline_id);
|
|
||||||
sorted_updates.push(*curr_upd);
|
|
||||||
i += 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Some(parent_id) => parent_id,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut j = i;
|
|
||||||
|
|
||||||
// we have not processed ancestor yet.
|
|
||||||
// there is a chance that it is at the same Lsn
|
|
||||||
if !known_timelineids.contains(&parent_id) {
|
|
||||||
let mut curr_lsn_branchfroms: HashMap<TimelineId, Vec<(TimelineId, usize)>> =
|
|
||||||
HashMap::new();
|
|
||||||
|
|
||||||
// inspect all branchpoints at the same lsn
|
|
||||||
while j < updates.len() && updates[j].lsn == curr_upd.lsn {
|
|
||||||
let lookahead_upd = &updates[j];
|
|
||||||
j += 1;
|
|
||||||
|
|
||||||
if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command {
|
|
||||||
match lookahead_parent_id {
|
|
||||||
Some(lookahead_parent_id)
|
|
||||||
if !known_timelineids.contains(&lookahead_parent_id) =>
|
|
||||||
{
|
|
||||||
// we have not processed ancestor yet
|
|
||||||
// store it for later
|
|
||||||
let es =
|
|
||||||
curr_lsn_branchfroms.entry(lookahead_parent_id).or_default();
|
|
||||||
es.push((lookahead_upd.timeline_id, j));
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
// we have already processed ancestor
|
|
||||||
// process this BranchFrom Update normally
|
|
||||||
known_timelineids.insert(lookahead_upd.timeline_id);
|
|
||||||
sorted_updates.push(*lookahead_upd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// process BranchFroms in the tree order
|
|
||||||
// check that we don't have a cycle if somet entry is orphan
|
|
||||||
// (this should not happen, but better to be safe)
|
|
||||||
let mut processed_some_entry = true;
|
|
||||||
while processed_some_entry {
|
|
||||||
processed_some_entry = false;
|
|
||||||
|
|
||||||
curr_lsn_branchfroms.retain(|parent_id, branchfroms| {
|
|
||||||
if known_timelineids.contains(parent_id) {
|
|
||||||
for (timeline_id, j) in branchfroms {
|
|
||||||
known_timelineids.insert(*timeline_id);
|
|
||||||
sorted_updates.push(updates[*j - 1]);
|
|
||||||
}
|
|
||||||
processed_some_entry = true;
|
|
||||||
false
|
|
||||||
} else {
|
|
||||||
true
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if !curr_lsn_branchfroms.is_empty() {
|
|
||||||
// orphans are expected to be rare and transient between tenant reloads
|
|
||||||
// for example, an broken ancestor without the child branch being broken.
|
|
||||||
anyhow::bail!(
|
|
||||||
"orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
assert!(j > i);
|
|
||||||
i = j;
|
|
||||||
} else {
|
|
||||||
// not a BranchFrom, keep the same order
|
|
||||||
sorted_updates.push(*curr_upd);
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(sorted_updates)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Gathers the inputs for the tenant sizing model.
|
/// Gathers the inputs for the tenant sizing model.
|
||||||
///
|
///
|
||||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||||
@@ -182,26 +62,25 @@ pub(super) async fn gather_inputs(
|
|||||||
tenant: &Tenant,
|
tenant: &Tenant,
|
||||||
limit: &Arc<Semaphore>,
|
limit: &Arc<Semaphore>,
|
||||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||||
ctx: &RequestContext,
|
tenant_ctx: &TenantRequestContext,
|
||||||
) -> anyhow::Result<ModelInputs> {
|
) -> anyhow::Result<ModelInputs> {
|
||||||
// with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
|
// with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
|
||||||
// our advantage with `?` error handling.
|
// our advantage with `?` error handling.
|
||||||
let mut joinset = tokio::task::JoinSet::new();
|
let mut joinset = tokio::task::JoinSet::new();
|
||||||
|
|
||||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
let timelines = tenant
|
||||||
tenant
|
.refresh_gc_info(tenant_ctx)
|
||||||
.refresh_gc_info(ctx)
|
|
||||||
.await
|
.await
|
||||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||||
|
|
||||||
let timelines = tenant.list_timelines();
|
|
||||||
|
|
||||||
if timelines.is_empty() {
|
if timelines.is_empty() {
|
||||||
// perhaps the tenant has just been created, and as such doesn't have any data yet
|
// All timelines are below tenant's gc_horizon; alternative would be to use
|
||||||
|
// Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
|
||||||
|
// missing GcInfo::retain_lsns or having obsolete values for cutoff's.
|
||||||
return Ok(ModelInputs {
|
return Ok(ModelInputs {
|
||||||
updates: vec![],
|
updates: vec![],
|
||||||
retention_period: 0,
|
retention_period: 0,
|
||||||
timeline_inputs: HashMap::default(),
|
timeline_inputs: HashMap::new(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,27 +91,27 @@ pub(super) async fn gather_inputs(
|
|||||||
|
|
||||||
let mut updates = Vec::new();
|
let mut updates = Vec::new();
|
||||||
|
|
||||||
// record the per timeline values useful to debug the model inputs, also used to track
|
// record the per timline values used to determine `retention_period`
|
||||||
// ancestor_lsn without keeping a hold of Timeline
|
|
||||||
let mut timeline_inputs = HashMap::with_capacity(timelines.len());
|
let mut timeline_inputs = HashMap::with_capacity(timelines.len());
|
||||||
|
|
||||||
// used to determine the `retention_period` for the size model
|
// used to determine the `retention_period` for the size model
|
||||||
let mut max_cutoff_distance = None;
|
let mut max_cutoff_distance = None;
|
||||||
|
|
||||||
// mapping from (TimelineId, Lsn) => if this branch point has been handled already via
|
let mut ctx_dropguards: Vec<tokio_util::sync::DropGuard> = Vec::new();
|
||||||
// GcInfo::retain_lsns or if it needs to have its logical_size calculated.
|
|
||||||
let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new();
|
|
||||||
|
|
||||||
for timeline in timelines {
|
for timeline in timelines {
|
||||||
if !timeline.is_active() {
|
|
||||||
anyhow::bail!(
|
|
||||||
"timeline {} is not active, cannot calculate tenant_size now",
|
|
||||||
timeline.timeline_id
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let last_record_lsn = timeline.get_last_record_lsn();
|
let last_record_lsn = timeline.get_last_record_lsn();
|
||||||
|
|
||||||
|
let ctx = match timeline.get_context(tenant_ctx) {
|
||||||
|
Ok(ctx) => ctx,
|
||||||
|
Err(state) => {
|
||||||
|
info!("skipping tenant size calculation for timeline because it is in {state:?} state");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
ctx_dropguards.push(ctx.cancellation_token().clone().drop_guard());
|
||||||
|
let ctx = Arc::new(ctx);
|
||||||
|
|
||||||
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
|
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
|
||||||
// there's a race between the update (holding tenant.gc_lock) and this read but it
|
// there's a race between the update (holding tenant.gc_lock) and this read but it
|
||||||
// might not be an issue, because it's not for Timeline::gc
|
// might not be an issue, because it's not for Timeline::gc
|
||||||
@@ -296,55 +175,48 @@ pub(super) async fn gather_inputs(
|
|||||||
|
|
||||||
// all timelines branch from something, because it might be impossible to pinpoint
|
// all timelines branch from something, because it might be impossible to pinpoint
|
||||||
// which is the tenant_size_model's "default" branch.
|
// which is the tenant_size_model's "default" branch.
|
||||||
|
|
||||||
let ancestor_lsn = timeline.get_ancestor_lsn();
|
|
||||||
|
|
||||||
updates.push(Update {
|
updates.push(Update {
|
||||||
lsn: ancestor_lsn,
|
lsn: timeline.get_ancestor_lsn(),
|
||||||
command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
|
command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
});
|
});
|
||||||
|
|
||||||
if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() {
|
for (lsn, _kind) in interesting_lsns.iter() {
|
||||||
// refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches
|
let lsn = *lsn;
|
||||||
// which are over gc_horizon. for example, a "main" branch which never received any
|
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, lsn)) {
|
||||||
// updates apart from initdb not have branch points recorded.
|
|
||||||
referenced_branch_froms
|
|
||||||
.entry((parent_timeline_id, timeline.get_ancestor_lsn()))
|
|
||||||
.or_default();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (lsn, _kind) in &interesting_lsns {
|
|
||||||
// mark this visited so don't need to re-process this parent
|
|
||||||
*referenced_branch_froms
|
|
||||||
.entry((timeline.timeline_id, *lsn))
|
|
||||||
.or_default() = true;
|
|
||||||
|
|
||||||
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
|
|
||||||
updates.push(Update {
|
updates.push(Update {
|
||||||
lsn: *lsn,
|
lsn,
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
command: Command::Update(*size),
|
command: Command::Update(*size),
|
||||||
});
|
});
|
||||||
|
|
||||||
needed_cache.insert((timeline.timeline_id, *lsn));
|
needed_cache.insert((timeline.timeline_id, lsn));
|
||||||
} else {
|
} else {
|
||||||
let timeline = Arc::clone(&timeline);
|
let timeline = Arc::clone(&timeline);
|
||||||
let parallel_size_calcs = Arc::clone(limit);
|
let parallel_size_calcs = Arc::clone(limit);
|
||||||
let ctx = ctx.attached_child();
|
let ctx_clone = Arc::clone(&ctx);
|
||||||
joinset.spawn(calculate_logical_size(
|
joinset.spawn(async move {
|
||||||
parallel_size_calcs,
|
calculate_logical_size(parallel_size_calcs, timeline, lsn, &ctx_clone).await
|
||||||
timeline,
|
});
|
||||||
*lsn,
|
|
||||||
ctx,
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// all timelines also have an end point if they have made any progress
|
||||||
|
if last_record_lsn > timeline.get_ancestor_lsn()
|
||||||
|
&& !interesting_lsns
|
||||||
|
.iter()
|
||||||
|
.any(|(lsn, _)| lsn == &last_record_lsn)
|
||||||
|
{
|
||||||
|
updates.push(Update {
|
||||||
|
lsn: last_record_lsn,
|
||||||
|
command: Command::EndOfBranch,
|
||||||
|
timeline_id: timeline.timeline_id,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
timeline_inputs.insert(
|
timeline_inputs.insert(
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
TimelineInputs {
|
TimelineInputs {
|
||||||
ancestor_lsn,
|
|
||||||
last_record: last_record_lsn,
|
last_record: last_record_lsn,
|
||||||
// this is not used above, because it might not have updated recently enough
|
// this is not used above, because it might not have updated recently enough
|
||||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||||
@@ -355,81 +227,6 @@ pub(super) async fn gather_inputs(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// iterate over discovered branch points and make sure we are getting logical sizes at those
|
|
||||||
// points.
|
|
||||||
for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() {
|
|
||||||
if *handled {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let timeline_id = *timeline_id;
|
|
||||||
let lsn = *lsn;
|
|
||||||
|
|
||||||
match timeline_inputs.get(&timeline_id) {
|
|
||||||
Some(inputs) if inputs.ancestor_lsn == lsn => {
|
|
||||||
// we don't need an update at this branch point which is also point where
|
|
||||||
// timeline_id branch was branched from.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Some(_) => {}
|
|
||||||
None => {
|
|
||||||
// we should have this because we have iterated through all of the timelines
|
|
||||||
anyhow::bail!("missing timeline_input for {timeline_id}")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) {
|
|
||||||
updates.push(Update {
|
|
||||||
lsn,
|
|
||||||
timeline_id,
|
|
||||||
command: Command::Update(*size),
|
|
||||||
});
|
|
||||||
|
|
||||||
needed_cache.insert((timeline_id, lsn));
|
|
||||||
} else {
|
|
||||||
let timeline = tenant
|
|
||||||
.get_timeline(timeline_id, false)
|
|
||||||
.context("find referenced ancestor timeline")?;
|
|
||||||
let parallel_size_calcs = Arc::clone(limit);
|
|
||||||
joinset.spawn(calculate_logical_size(
|
|
||||||
parallel_size_calcs,
|
|
||||||
timeline.clone(),
|
|
||||||
lsn,
|
|
||||||
ctx.attached_child(),
|
|
||||||
));
|
|
||||||
|
|
||||||
if let Some(parent_id) = timeline.get_ancestor_timeline_id() {
|
|
||||||
// we should not find new ones because we iterated tenants all timelines
|
|
||||||
anyhow::ensure!(
|
|
||||||
timeline_inputs.contains_key(&parent_id),
|
|
||||||
"discovered new timeline {parent_id} (parent of {timeline_id})"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch
|
|
||||||
// point. this is needed by the model.
|
|
||||||
for (timeline_id, inputs) in timeline_inputs.iter() {
|
|
||||||
let lsn = inputs.last_record;
|
|
||||||
|
|
||||||
if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) {
|
|
||||||
// this means that the (timeline_id, last_record_lsn) represents a branch point
|
|
||||||
// we do not want to add EndOfBranch updates for these points because it doesn't fit
|
|
||||||
// into the current tenant_size_model.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if lsn > inputs.ancestor_lsn {
|
|
||||||
// all timelines also have an end point if they have made any progress
|
|
||||||
updates.push(Update {
|
|
||||||
lsn,
|
|
||||||
command: Command::EndOfBranch,
|
|
||||||
timeline_id: *timeline_id,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut have_any_error = false;
|
let mut have_any_error = false;
|
||||||
|
|
||||||
while let Some(res) = joinset.join_next().await {
|
while let Some(res) = joinset.join_next().await {
|
||||||
@@ -486,13 +283,8 @@ pub(super) async fn gather_inputs(
|
|||||||
// for branch points, which come as multiple updates at the same LSN, the Command::Update
|
// for branch points, which come as multiple updates at the same LSN, the Command::Update
|
||||||
// is needed before a branch is made out of that branch Command::BranchFrom. this is
|
// is needed before a branch is made out of that branch Command::BranchFrom. this is
|
||||||
// handled by the variant order in `Command`.
|
// handled by the variant order in `Command`.
|
||||||
//
|
|
||||||
updates.sort_unstable();
|
updates.sort_unstable();
|
||||||
|
|
||||||
// And another sort to handle Command::BranchFrom ordering
|
|
||||||
// in case when there are multiple branches at the same LSN.
|
|
||||||
let sorted_updates = sort_updates_in_tree_order(updates)?;
|
|
||||||
|
|
||||||
let retention_period = match max_cutoff_distance {
|
let retention_period = match max_cutoff_distance {
|
||||||
Some(max) => max.0,
|
Some(max) => max.0,
|
||||||
None => {
|
None => {
|
||||||
@@ -501,7 +293,7 @@ pub(super) async fn gather_inputs(
|
|||||||
};
|
};
|
||||||
|
|
||||||
Ok(ModelInputs {
|
Ok(ModelInputs {
|
||||||
updates: sorted_updates,
|
updates,
|
||||||
retention_period,
|
retention_period,
|
||||||
timeline_inputs,
|
timeline_inputs,
|
||||||
})
|
})
|
||||||
@@ -519,23 +311,21 @@ impl ModelInputs {
|
|||||||
command: op,
|
command: op,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
} = update;
|
} = update;
|
||||||
|
|
||||||
let Lsn(now) = *lsn;
|
let Lsn(now) = *lsn;
|
||||||
match op {
|
match op {
|
||||||
Command::Update(sz) => {
|
Command::Update(sz) => {
|
||||||
storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?;
|
storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
|
||||||
}
|
}
|
||||||
Command::EndOfBranch => {
|
Command::EndOfBranch => {
|
||||||
storage.insert_point(&Some(*timeline_id), "".into(), now, None)?;
|
storage.insert_point(&Some(*timeline_id), "".into(), now, None);
|
||||||
}
|
}
|
||||||
Command::BranchFrom(parent) => {
|
Command::BranchFrom(parent) => {
|
||||||
// This branch command may fail if it cannot find a parent to branch from.
|
storage.branch(parent, Some(*timeline_id));
|
||||||
storage.branch(parent, Some(*timeline_id))?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(storage.calculate(self.retention_period)?.total_children())
|
Ok(storage.calculate(self.retention_period).total_children())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -583,7 +373,7 @@ enum LsnKind {
|
|||||||
struct TimelineAtLsnSizeResult(
|
struct TimelineAtLsnSizeResult(
|
||||||
Arc<crate::tenant::Timeline>,
|
Arc<crate::tenant::Timeline>,
|
||||||
utils::lsn::Lsn,
|
utils::lsn::Lsn,
|
||||||
Result<u64, CalculateLogicalSizeError>,
|
Result<u64, PageReconstructError>,
|
||||||
);
|
);
|
||||||
|
|
||||||
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
|
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
|
||||||
@@ -591,16 +381,13 @@ async fn calculate_logical_size(
|
|||||||
limit: Arc<tokio::sync::Semaphore>,
|
limit: Arc<tokio::sync::Semaphore>,
|
||||||
timeline: Arc<crate::tenant::Timeline>,
|
timeline: Arc<crate::tenant::Timeline>,
|
||||||
lsn: utils::lsn::Lsn,
|
lsn: utils::lsn::Lsn,
|
||||||
ctx: RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<TimelineAtLsnSizeResult, RecvError> {
|
) -> Result<TimelineAtLsnSizeResult, PageReconstructError> {
|
||||||
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||||
.await
|
.await
|
||||||
.expect("global semaphore should not had been closed");
|
.expect("global semaphore should not have been closed");
|
||||||
|
|
||||||
let size_res = timeline
|
let size_res = timeline.calculate_logical_size(lsn, ctx).await;
|
||||||
.spawn_ondemand_logical_size_calculation(lsn, ctx)
|
|
||||||
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
|
|
||||||
.await?;
|
|
||||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -685,146 +472,9 @@ fn updates_sort() {
|
|||||||
fn verify_size_for_multiple_branches() {
|
fn verify_size_for_multiple_branches() {
|
||||||
// this is generated from integration test test_tenant_size_with_multiple_branches, but this way
|
// this is generated from integration test test_tenant_size_with_multiple_branches, but this way
|
||||||
// it has the stable lsn's
|
// it has the stable lsn's
|
||||||
//
|
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
|
||||||
// timelineinputs have been left out, because those explain the inputs, but don't participate
|
|
||||||
// in further size calculations.
|
|
||||||
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#;
|
|
||||||
|
|
||||||
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
|
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
|
||||||
|
|
||||||
assert_eq!(inputs.calculate().unwrap(), 36_409_872);
|
assert_eq!(inputs.calculate().unwrap(), 36_409_872);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn updates_sort_with_branches_at_same_lsn() {
|
|
||||||
use std::str::FromStr;
|
|
||||||
use Command::{BranchFrom, EndOfBranch};
|
|
||||||
|
|
||||||
macro_rules! lsn {
|
|
||||||
($e:expr) => {
|
|
||||||
Lsn::from_str($e).unwrap()
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let ids = [
|
|
||||||
TimelineId::from_str("00000000000000000000000000000000").unwrap(),
|
|
||||||
TimelineId::from_str("11111111111111111111111111111111").unwrap(),
|
|
||||||
TimelineId::from_str("22222222222222222222222222222222").unwrap(),
|
|
||||||
TimelineId::from_str("33333333333333333333333333333333").unwrap(),
|
|
||||||
TimelineId::from_str("44444444444444444444444444444444").unwrap(),
|
|
||||||
];
|
|
||||||
|
|
||||||
// issue https://github.com/neondatabase/neon/issues/3179
|
|
||||||
let commands = vec![
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/0"),
|
|
||||||
command: BranchFrom(None),
|
|
||||||
timeline_id: ids[0],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: Command::Update(25387008),
|
|
||||||
timeline_id: ids[0],
|
|
||||||
},
|
|
||||||
// next three are wrongly sorted, because
|
|
||||||
// ids[1] is branched from before ids[1] exists
|
|
||||||
// and ids[2] is branched from before ids[2] exists
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: BranchFrom(Some(ids[1])),
|
|
||||||
timeline_id: ids[3],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: BranchFrom(Some(ids[0])),
|
|
||||||
timeline_id: ids[2],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: BranchFrom(Some(ids[2])),
|
|
||||||
timeline_id: ids[1],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/1CA85B8"),
|
|
||||||
command: Command::Update(28925952),
|
|
||||||
timeline_id: ids[1],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/1CD85B8"),
|
|
||||||
command: Command::Update(29024256),
|
|
||||||
timeline_id: ids[1],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/1CD85B8"),
|
|
||||||
command: BranchFrom(Some(ids[1])),
|
|
||||||
timeline_id: ids[4],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/22DCE70"),
|
|
||||||
command: Command::Update(32546816),
|
|
||||||
timeline_id: ids[3],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/230CE70"),
|
|
||||||
command: EndOfBranch,
|
|
||||||
timeline_id: ids[3],
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
let expected = vec![
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/0"),
|
|
||||||
command: BranchFrom(None),
|
|
||||||
timeline_id: ids[0],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: Command::Update(25387008),
|
|
||||||
timeline_id: ids[0],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: BranchFrom(Some(ids[0])),
|
|
||||||
timeline_id: ids[2],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: BranchFrom(Some(ids[2])),
|
|
||||||
timeline_id: ids[1],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/169AD58"),
|
|
||||||
command: BranchFrom(Some(ids[1])),
|
|
||||||
timeline_id: ids[3],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/1CA85B8"),
|
|
||||||
command: Command::Update(28925952),
|
|
||||||
timeline_id: ids[1],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/1CD85B8"),
|
|
||||||
command: Command::Update(29024256),
|
|
||||||
timeline_id: ids[1],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/1CD85B8"),
|
|
||||||
command: BranchFrom(Some(ids[1])),
|
|
||||||
timeline_id: ids[4],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/22DCE70"),
|
|
||||||
command: Command::Update(32546816),
|
|
||||||
timeline_id: ids[3],
|
|
||||||
},
|
|
||||||
Update {
|
|
||||||
lsn: lsn!("0/230CE70"),
|
|
||||||
command: EndOfBranch,
|
|
||||||
timeline_id: ids[3],
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
let sorted_commands = sort_updates_in_tree_order(commands).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(sorted_commands, expected);
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -196,50 +196,3 @@ pub fn downcast_remote_layer(
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for dyn Layer {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
f.debug_struct("Layer")
|
|
||||||
.field("short_id", &self.short_id())
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Holds metadata about a layer without any content. Used mostly for testing.
|
|
||||||
pub struct LayerDescriptor {
|
|
||||||
pub key: Range<Key>,
|
|
||||||
pub lsn: Range<Lsn>,
|
|
||||||
pub is_incremental: bool,
|
|
||||||
pub short_id: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Layer for LayerDescriptor {
|
|
||||||
fn get_key_range(&self) -> Range<Key> {
|
|
||||||
self.key.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
|
||||||
self.lsn.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_incremental(&self) -> bool {
|
|
||||||
self.is_incremental
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_value_reconstruct_data(
|
|
||||||
&self,
|
|
||||||
_key: Key,
|
|
||||||
_lsn_range: Range<Lsn>,
|
|
||||||
_reconstruct_data: &mut ValueReconstructState,
|
|
||||||
) -> Result<ValueReconstructResult> {
|
|
||||||
todo!("This method shouldn't be part of the Layer trait")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn short_id(&self) -> String {
|
|
||||||
self.short_id.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn dump(&self, _verbose: bool) -> Result<()> {
|
|
||||||
todo!()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,46 +1,39 @@
|
|||||||
//! This module contains functions to serve per-tenant background processes,
|
//! This module contains functions to serve per-tenant background processes,
|
||||||
//! such as compaction and GC
|
//! such as compaction and GC
|
||||||
|
|
||||||
use std::ops::ControlFlow;
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||||
use crate::metrics::TENANT_TASK_EVENTS;
|
use crate::metrics::TENANT_TASK_EVENTS;
|
||||||
use crate::task_mgr;
|
use crate::task_mgr;
|
||||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
use crate::task_mgr::BACKGROUND_RUNTIME;
|
||||||
use crate::tenant::mgr;
|
use crate::tenant::Tenant;
|
||||||
use crate::tenant::{Tenant, TenantState};
|
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::TenantId;
|
|
||||||
|
|
||||||
pub fn start_background_loops(tenant_id: TenantId) {
|
pub fn start_background_loops(tenant: &Arc<Tenant>) {
|
||||||
|
let tenant_id = tenant.tenant_id;
|
||||||
|
|
||||||
|
let tenant_clone = Arc::clone(tenant);
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
BACKGROUND_RUNTIME.handle(),
|
BACKGROUND_RUNTIME.handle(),
|
||||||
TaskKind::Compaction,
|
|
||||||
Some(tenant_id),
|
|
||||||
None,
|
|
||||||
&format!("compactor for tenant {tenant_id}"),
|
&format!("compactor for tenant {tenant_id}"),
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
compaction_loop(tenant_id)
|
compaction_loop(&tenant_clone)
|
||||||
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
|
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
|
||||||
.await;
|
.await;
|
||||||
Ok(())
|
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
let tenant_clone = Arc::clone(tenant);
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
BACKGROUND_RUNTIME.handle(),
|
BACKGROUND_RUNTIME.handle(),
|
||||||
TaskKind::GarbageCollector,
|
|
||||||
Some(tenant_id),
|
|
||||||
None,
|
|
||||||
&format!("garbage collector for tenant {tenant_id}"),
|
&format!("garbage collector for tenant {tenant_id}"),
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
gc_loop(tenant_id)
|
gc_loop(&tenant_clone)
|
||||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
|
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
|
||||||
.await;
|
.await;
|
||||||
Ok(())
|
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -48,26 +41,27 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
|||||||
///
|
///
|
||||||
/// Compaction task's main loop
|
/// Compaction task's main loop
|
||||||
///
|
///
|
||||||
async fn compaction_loop(tenant_id: TenantId) {
|
async fn compaction_loop(tenant: &Arc<Tenant>) {
|
||||||
let wait_duration = Duration::from_secs(2);
|
let wait_duration = Duration::from_secs(2);
|
||||||
info!("starting");
|
info!("starting");
|
||||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
let top_ctx = RequestContext::new(TaskKind::Compaction, DownloadBehavior::Download);
|
||||||
|
|
||||||
|
let tenant_ctx = match tenant.get_context(&top_ctx) {
|
||||||
|
Ok(ctx) => ctx,
|
||||||
|
Err(state) => {
|
||||||
|
// This could happen if the tenant is detached or the pageserver is shut
|
||||||
|
// down immediately after loading or attaching completed and the tenant
|
||||||
|
// was activated. It seems unlikely enough in practice that we better print
|
||||||
|
// a warning, as it could also be a bug.
|
||||||
|
error!("Not running compaction loop, tenant is not active: {state:?}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
loop {
|
loop {
|
||||||
trace!("waking up");
|
trace!("waking up");
|
||||||
|
|
||||||
let tenant = tokio::select! {
|
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
|
||||||
info!("received cancellation request");
|
|
||||||
return;
|
|
||||||
},
|
|
||||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
|
||||||
ControlFlow::Break(()) => return,
|
|
||||||
ControlFlow::Continue(tenant) => tenant,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut sleep_duration = tenant.get_compaction_period();
|
let mut sleep_duration = tenant.get_compaction_period();
|
||||||
if sleep_duration == Duration::ZERO {
|
if sleep_duration == Duration::ZERO {
|
||||||
info!("automatic compaction is disabled");
|
info!("automatic compaction is disabled");
|
||||||
@@ -75,7 +69,7 @@ async fn compaction_loop(tenant_id: TenantId) {
|
|||||||
sleep_duration = Duration::from_secs(10);
|
sleep_duration = Duration::from_secs(10);
|
||||||
} else {
|
} else {
|
||||||
// Run compaction
|
// Run compaction
|
||||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
if let Err(e) = tenant.compaction_iteration(&tenant_ctx).await {
|
||||||
sleep_duration = wait_duration;
|
sleep_duration = wait_duration;
|
||||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||||
}
|
}
|
||||||
@@ -83,7 +77,7 @@ async fn compaction_loop(tenant_id: TenantId) {
|
|||||||
|
|
||||||
// Sleep
|
// Sleep
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = tenant_ctx.cancelled() => {
|
||||||
info!("received cancellation request during idling");
|
info!("received cancellation request during idling");
|
||||||
break;
|
break;
|
||||||
},
|
},
|
||||||
@@ -100,28 +94,28 @@ async fn compaction_loop(tenant_id: TenantId) {
|
|||||||
///
|
///
|
||||||
/// GC task's main loop
|
/// GC task's main loop
|
||||||
///
|
///
|
||||||
async fn gc_loop(tenant_id: TenantId) {
|
async fn gc_loop(tenant: &Arc<Tenant>) {
|
||||||
let wait_duration = Duration::from_secs(2);
|
let wait_duration = Duration::from_secs(2);
|
||||||
info!("starting");
|
info!("starting");
|
||||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||||
async {
|
async {
|
||||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||||
// cutoff specified as time.
|
// cutoff specified as time.
|
||||||
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
let top_ctx = RequestContext::new(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||||
|
let tenant_ctx = match tenant.get_context(&top_ctx) {
|
||||||
|
Ok(ctx) => ctx,
|
||||||
|
Err(state) => {
|
||||||
|
// This could happen if the tenant is detached or the pageserver is shut
|
||||||
|
// down immediately after loading or attaching completed and the tenant
|
||||||
|
// was activated. It seems unlikely enough in practice that we better print
|
||||||
|
// a warning, as it could also be a bug.
|
||||||
|
error!("Not running GC loop, tenant is not active: {state:?}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
loop {
|
loop {
|
||||||
trace!("waking up");
|
trace!("waking up");
|
||||||
|
|
||||||
let tenant = tokio::select! {
|
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
|
||||||
info!("received cancellation request");
|
|
||||||
return;
|
|
||||||
},
|
|
||||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
|
||||||
ControlFlow::Break(()) => return,
|
|
||||||
ControlFlow::Continue(tenant) => tenant,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let gc_period = tenant.get_gc_period();
|
let gc_period = tenant.get_gc_period();
|
||||||
let gc_horizon = tenant.get_gc_horizon();
|
let gc_horizon = tenant.get_gc_horizon();
|
||||||
let mut sleep_duration = gc_period;
|
let mut sleep_duration = gc_period;
|
||||||
@@ -132,7 +126,10 @@ async fn gc_loop(tenant_id: TenantId) {
|
|||||||
} else {
|
} else {
|
||||||
// Run gc
|
// Run gc
|
||||||
if gc_horizon > 0 {
|
if gc_horizon > 0 {
|
||||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
|
// Run compaction
|
||||||
|
if let Err(e) = tenant
|
||||||
|
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &tenant_ctx)
|
||||||
|
.await
|
||||||
{
|
{
|
||||||
sleep_duration = wait_duration;
|
sleep_duration = wait_duration;
|
||||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||||
@@ -142,7 +139,7 @@ async fn gc_loop(tenant_id: TenantId) {
|
|||||||
|
|
||||||
// Sleep
|
// Sleep
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = tenant_ctx.cancelled() => {
|
||||||
info!("received cancellation request during idling");
|
info!("received cancellation request during idling");
|
||||||
break;
|
break;
|
||||||
},
|
},
|
||||||
@@ -154,46 +151,3 @@ async fn gc_loop(tenant_id: TenantId) {
|
|||||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||||
trace!("GC loop stopped.");
|
trace!("GC loop stopped.");
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_active_tenant(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
wait: Duration,
|
|
||||||
) -> ControlFlow<(), Arc<Tenant>> {
|
|
||||||
let tenant = loop {
|
|
||||||
match mgr::get_tenant(tenant_id, false).await {
|
|
||||||
Ok(tenant) => break tenant,
|
|
||||||
Err(e) => {
|
|
||||||
error!("Failed to get a tenant {tenant_id}: {e:#}");
|
|
||||||
tokio::time::sleep(wait).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// if the tenant has a proper status already, no need to wait for anything
|
|
||||||
if tenant.current_state() == TenantState::Active {
|
|
||||||
ControlFlow::Continue(tenant)
|
|
||||||
} else {
|
|
||||||
let mut tenant_state_updates = tenant.subscribe_for_state_updates();
|
|
||||||
loop {
|
|
||||||
match tenant_state_updates.changed().await {
|
|
||||||
Ok(()) => {
|
|
||||||
let new_state = *tenant_state_updates.borrow();
|
|
||||||
match new_state {
|
|
||||||
TenantState::Active => {
|
|
||||||
debug!("Tenant state changed to active, continuing the task loop");
|
|
||||||
return ControlFlow::Continue(tenant);
|
|
||||||
}
|
|
||||||
state => {
|
|
||||||
debug!("Not running the task loop, tenant is not active: {state:?}");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(_sender_dropped_error) => {
|
|
||||||
info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
|
|
||||||
return ControlFlow::Break(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -4,6 +4,7 @@ use super::storage_layer::LayerFileName;
|
|||||||
use crate::tenant::metadata::TimelineMetadata;
|
use crate::tenant::metadata::TimelineMetadata;
|
||||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||||
|
use crate::tenant::TimelineRequestContext;
|
||||||
use std::collections::{HashMap, VecDeque};
|
use std::collections::{HashMap, VecDeque};
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
|
||||||
@@ -73,6 +74,13 @@ pub(crate) struct UploadQueueInitialized {
|
|||||||
/// tasks to finish. For example, metadata upload cannot be performed before all
|
/// tasks to finish. For example, metadata upload cannot be performed before all
|
||||||
/// preceding layer file uploads have completed.
|
/// preceding layer file uploads have completed.
|
||||||
pub(crate) queued_operations: VecDeque<UploadOp>,
|
pub(crate) queued_operations: VecDeque<UploadOp>,
|
||||||
|
|
||||||
|
/// Context used for the upload tasks. Note that this is associated with the
|
||||||
|
/// Timeline, so this prevents the Timeline from being shut down. To ensure quick
|
||||||
|
/// shutdown, RemoteTimelineClient spawns a task to wait for cancellation on the
|
||||||
|
/// context and stop the queue. Otherwise we woudn't notice the cancellation
|
||||||
|
/// until next upload attempt.
|
||||||
|
pub(crate) upload_ctx: TimelineRequestContext,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) struct UploadQueueStopped {
|
pub(crate) struct UploadQueueStopped {
|
||||||
@@ -83,6 +91,7 @@ impl UploadQueue {
|
|||||||
pub(crate) fn initialize_empty_remote(
|
pub(crate) fn initialize_empty_remote(
|
||||||
&mut self,
|
&mut self,
|
||||||
metadata: &TimelineMetadata,
|
metadata: &TimelineMetadata,
|
||||||
|
upload_ctx: TimelineRequestContext,
|
||||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||||
match self {
|
match self {
|
||||||
UploadQueue::Uninitialized => (),
|
UploadQueue::Uninitialized => (),
|
||||||
@@ -108,6 +117,7 @@ impl UploadQueue {
|
|||||||
num_inprogress_deletions: 0,
|
num_inprogress_deletions: 0,
|
||||||
inprogress_tasks: HashMap::new(),
|
inprogress_tasks: HashMap::new(),
|
||||||
queued_operations: VecDeque::new(),
|
queued_operations: VecDeque::new(),
|
||||||
|
upload_ctx,
|
||||||
};
|
};
|
||||||
|
|
||||||
*self = UploadQueue::Initialized(state);
|
*self = UploadQueue::Initialized(state);
|
||||||
@@ -117,6 +127,7 @@ impl UploadQueue {
|
|||||||
pub(crate) fn initialize_with_current_remote_index_part(
|
pub(crate) fn initialize_with_current_remote_index_part(
|
||||||
&mut self,
|
&mut self,
|
||||||
index_part: &IndexPart,
|
index_part: &IndexPart,
|
||||||
|
upload_ctx: TimelineRequestContext,
|
||||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||||
match self {
|
match self {
|
||||||
UploadQueue::Uninitialized => (),
|
UploadQueue::Uninitialized => (),
|
||||||
@@ -153,6 +164,7 @@ impl UploadQueue {
|
|||||||
num_inprogress_deletions: 0,
|
num_inprogress_deletions: 0,
|
||||||
inprogress_tasks: HashMap::new(),
|
inprogress_tasks: HashMap::new(),
|
||||||
queued_operations: VecDeque::new(),
|
queued_operations: VecDeque::new(),
|
||||||
|
upload_ctx,
|
||||||
};
|
};
|
||||||
|
|
||||||
*self = UploadQueue::Initialized(state);
|
*self = UploadQueue::Initialized(state);
|
||||||
|
|||||||
@@ -25,14 +25,13 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
|
|||||||
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
|
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||||
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
|
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::{Context, Result};
|
||||||
use bytes::{Buf, Bytes, BytesMut};
|
use bytes::{Buf, Bytes, BytesMut};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
|
||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
use crate::tenant::PageReconstructError;
|
use crate::tenant::PageReconstructError;
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::{Timeline, TimelineRequestContext};
|
||||||
use crate::walrecord::*;
|
use crate::walrecord::*;
|
||||||
use crate::ZERO_PAGE;
|
use crate::ZERO_PAGE;
|
||||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||||
@@ -56,7 +55,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
pub async fn new(
|
pub async fn new(
|
||||||
timeline: &'a Timeline,
|
timeline: &'a Timeline,
|
||||||
startpoint: Lsn,
|
startpoint: Lsn,
|
||||||
ctx: &'_ RequestContext,
|
ctx: &'_ TimelineRequestContext,
|
||||||
) -> anyhow::Result<WalIngest<'a>> {
|
) -> anyhow::Result<WalIngest<'a>> {
|
||||||
// Fetch the latest checkpoint into memory, so that we can compare with it
|
// Fetch the latest checkpoint into memory, so that we can compare with it
|
||||||
// quickly in `ingest_record` and update it when it changes.
|
// quickly in `ingest_record` and update it when it changes.
|
||||||
@@ -85,8 +84,8 @@ impl<'a> WalIngest<'a> {
|
|||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
decoded: &mut DecodedWALRecord,
|
decoded: &mut DecodedWALRecord,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> Result<(), PageReconstructError> {
|
||||||
modification.lsn = lsn;
|
modification.lsn = lsn;
|
||||||
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
|
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
|
||||||
|
|
||||||
@@ -289,7 +288,8 @@ impl<'a> WalIngest<'a> {
|
|||||||
{
|
{
|
||||||
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
|
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
|
||||||
buf.copy_to_slice(&mut checkpoint_bytes);
|
buf.copy_to_slice(&mut checkpoint_bytes);
|
||||||
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
let xlog_checkpoint =
|
||||||
|
CheckPoint::decode(&checkpoint_bytes).context("error decoding checkpoint")?;
|
||||||
trace!(
|
trace!(
|
||||||
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
||||||
xlog_checkpoint.oldestXid,
|
xlog_checkpoint.oldestXid,
|
||||||
@@ -316,7 +316,10 @@ impl<'a> WalIngest<'a> {
|
|||||||
|
|
||||||
// If checkpoint data was updated, store the new version in the repository
|
// If checkpoint data was updated, store the new version in the repository
|
||||||
if self.checkpoint_modified {
|
if self.checkpoint_modified {
|
||||||
let new_checkpoint_bytes = self.checkpoint.encode()?;
|
let new_checkpoint_bytes = self
|
||||||
|
.checkpoint
|
||||||
|
.encode()
|
||||||
|
.context("error encoding checkpoint")?;
|
||||||
|
|
||||||
modification.put_checkpoint(new_checkpoint_bytes)?;
|
modification.put_checkpoint(new_checkpoint_bytes)?;
|
||||||
self.checkpoint_modified = false;
|
self.checkpoint_modified = false;
|
||||||
@@ -335,7 +338,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
decoded: &DecodedWALRecord,
|
decoded: &DecodedWALRecord,
|
||||||
blk: &DecodedBkpBlock,
|
blk: &DecodedBkpBlock,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<(), PageReconstructError> {
|
) -> Result<(), PageReconstructError> {
|
||||||
let rel = RelTag {
|
let rel = RelTag {
|
||||||
spcnode: blk.rnode_spcnode,
|
spcnode: blk.rnode_spcnode,
|
||||||
@@ -396,7 +399,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
buf: &mut Bytes,
|
buf: &mut Bytes,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
decoded: &mut DecodedWALRecord,
|
decoded: &mut DecodedWALRecord,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Handle VM bit updates that are implicitly part of heap records.
|
// Handle VM bit updates that are implicitly part of heap records.
|
||||||
|
|
||||||
@@ -547,7 +550,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
rec: &XlCreateDatabase,
|
rec: &XlCreateDatabase,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let db_id = rec.db_id;
|
let db_id = rec.db_id;
|
||||||
let tablespace_id = rec.tablespace_id;
|
let tablespace_id = rec.tablespace_id;
|
||||||
@@ -623,7 +626,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
rec: &XlSmgrCreate,
|
rec: &XlSmgrCreate,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let rel = RelTag {
|
let rel = RelTag {
|
||||||
spcnode: rec.rnode.spcnode,
|
spcnode: rec.rnode.spcnode,
|
||||||
@@ -642,7 +645,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
rec: &XlSmgrTruncate,
|
rec: &XlSmgrTruncate,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let spcnode = rec.rnode.spcnode;
|
let spcnode = rec.rnode.spcnode;
|
||||||
let dbnode = rec.rnode.dbnode;
|
let dbnode = rec.rnode.dbnode;
|
||||||
@@ -713,7 +716,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
parsed: &XlXactParsedRecord,
|
parsed: &XlXactParsedRecord,
|
||||||
is_commit: bool,
|
is_commit: bool,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Record update of CLOG pages
|
// Record update of CLOG pages
|
||||||
let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||||
@@ -786,7 +789,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
xlrec: &XlClogTruncate,
|
xlrec: &XlClogTruncate,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
info!(
|
info!(
|
||||||
"RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}",
|
"RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}",
|
||||||
@@ -927,7 +930,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
xlrec: &XlMultiXactTruncate,
|
xlrec: &XlMultiXactTruncate,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
|
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||||
self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||||
@@ -965,7 +968,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
xlrec: &XlRelmapUpdate,
|
xlrec: &XlRelmapUpdate,
|
||||||
decoded: &DecodedWALRecord,
|
decoded: &DecodedWALRecord,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut buf = decoded.record.clone();
|
let mut buf = decoded.record.clone();
|
||||||
buf.advance(decoded.main_data_offset);
|
buf.advance(decoded.main_data_offset);
|
||||||
@@ -986,7 +989,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
modification.put_rel_creation(rel, 0, ctx).await?;
|
modification.put_rel_creation(rel, 0, ctx).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -998,7 +1001,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
img: Bytes,
|
img: Bytes,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<(), PageReconstructError> {
|
) -> Result<(), PageReconstructError> {
|
||||||
self.handle_rel_extend(modification, rel, blknum, ctx)
|
self.handle_rel_extend(modification, rel, blknum, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -1012,7 +1015,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
rec: NeonWalRecord,
|
rec: NeonWalRecord,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.handle_rel_extend(modification, rel, blknum, ctx)
|
self.handle_rel_extend(modification, rel, blknum, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -1025,7 +1028,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
nblocks: BlockNumber,
|
nblocks: BlockNumber,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
modification.put_rel_truncation(rel, nblocks, ctx).await?;
|
modification.put_rel_truncation(rel, nblocks, ctx).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -1035,7 +1038,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
modification.put_rel_drop(rel, ctx).await?;
|
modification.put_rel_drop(rel, ctx).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -1045,7 +1048,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<BlockNumber> {
|
) -> anyhow::Result<BlockNumber> {
|
||||||
let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
|
let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
|
||||||
0
|
0
|
||||||
@@ -1060,7 +1063,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
modification: &mut DatadirModification<'_>,
|
modification: &mut DatadirModification<'_>,
|
||||||
rel: RelTag,
|
rel: RelTag,
|
||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<(), PageReconstructError> {
|
) -> Result<(), PageReconstructError> {
|
||||||
let new_nblocks = blknum + 1;
|
let new_nblocks = blknum + 1;
|
||||||
// Check if the relation exists. We implicitly create relations on first
|
// Check if the relation exists. We implicitly create relations on first
|
||||||
@@ -1098,7 +1101,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
segno: u32,
|
segno: u32,
|
||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
img: Bytes,
|
img: Bytes,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.handle_slru_extend(modification, kind, segno, blknum, ctx)
|
self.handle_slru_extend(modification, kind, segno, blknum, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -1112,7 +1115,7 @@ impl<'a> WalIngest<'a> {
|
|||||||
kind: SlruKind,
|
kind: SlruKind,
|
||||||
segno: u32,
|
segno: u32,
|
||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// we don't use a cache for this like we do for relations. SLRUS are explcitly
|
// we don't use a cache for this like we do for relations. SLRUS are explcitly
|
||||||
// extended with ZEROPAGE records, not with commit records, so it happens
|
// extended with ZEROPAGE records, not with commit records, so it happens
|
||||||
@@ -1186,7 +1189,7 @@ mod tests {
|
|||||||
|
|
||||||
async fn init_walingest_test<'a>(
|
async fn init_walingest_test<'a>(
|
||||||
tline: &'a Timeline,
|
tline: &'a Timeline,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> Result<WalIngest<'a>> {
|
) -> Result<WalIngest<'a>> {
|
||||||
let mut m = tline.begin_modification(Lsn(0x10));
|
let mut m = tline.begin_modification(Lsn(0x10));
|
||||||
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
||||||
@@ -1199,8 +1202,9 @@ mod tests {
|
|||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_relsize() -> Result<()> {
|
async fn test_relsize() -> Result<()> {
|
||||||
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
|
let (tenant, tenant_ctx) = TenantHarness::create("test_relsize")?.load().await;
|
||||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
let (tline, ctx) =
|
||||||
|
create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &tenant_ctx)?;
|
||||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||||
|
|
||||||
let mut m = tline.begin_modification(Lsn(0x20));
|
let mut m = tline.begin_modification(Lsn(0x20));
|
||||||
@@ -1418,8 +1422,9 @@ mod tests {
|
|||||||
// and then created it again within the same layer.
|
// and then created it again within the same layer.
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_drop_extend() -> Result<()> {
|
async fn test_drop_extend() -> Result<()> {
|
||||||
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
|
let (tenant, tenant_ctx) = TenantHarness::create("test_drop_extend")?.load().await;
|
||||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
let (tline, ctx) =
|
||||||
|
create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &tenant_ctx)?;
|
||||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||||
|
|
||||||
let mut m = tline.begin_modification(Lsn(0x20));
|
let mut m = tline.begin_modification(Lsn(0x20));
|
||||||
@@ -1487,8 +1492,9 @@ mod tests {
|
|||||||
// and then extended it again within the same layer.
|
// and then extended it again within the same layer.
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_truncate_extend() -> Result<()> {
|
async fn test_truncate_extend() -> Result<()> {
|
||||||
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
|
let (tenant, tenant_ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
|
||||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
let (tline, ctx) =
|
||||||
|
create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &tenant_ctx)?;
|
||||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||||
|
|
||||||
// Create a 20 MB relation (the size is arbitrary)
|
// Create a 20 MB relation (the size is arbitrary)
|
||||||
@@ -1627,8 +1633,9 @@ mod tests {
|
|||||||
/// split into multiple 1 GB segments in Postgres.
|
/// split into multiple 1 GB segments in Postgres.
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_large_rel() -> Result<()> {
|
async fn test_large_rel() -> Result<()> {
|
||||||
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
|
let (tenant, tenant_ctx) = TenantHarness::create("test_large_rel")?.load().await;
|
||||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
let (tline, ctx) =
|
||||||
|
create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &tenant_ctx)?;
|
||||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||||
|
|
||||||
let mut lsn = 0x10;
|
let mut lsn = 0x10;
|
||||||
|
|||||||
@@ -103,23 +103,18 @@ pub enum TaskStateUpdate<E> {
|
|||||||
impl<E: Clone> TaskHandle<E> {
|
impl<E: Clone> TaskHandle<E> {
|
||||||
/// Initializes the task, starting it immediately after the creation.
|
/// Initializes the task, starting it immediately after the creation.
|
||||||
pub fn spawn<Fut>(
|
pub fn spawn<Fut>(
|
||||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
|
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>) -> Fut + Send + 'static,
|
||||||
|
cancellation: CancellationToken,
|
||||||
) -> Self
|
) -> Self
|
||||||
where
|
where
|
||||||
Fut: Future<Output = anyhow::Result<()>> + Send,
|
Fut: Future<Output = anyhow::Result<()>> + Send,
|
||||||
E: Send + Sync + 'static,
|
E: Send + Sync + 'static,
|
||||||
{
|
{
|
||||||
let cancellation = CancellationToken::new();
|
|
||||||
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
||||||
|
|
||||||
let cancellation_clone = cancellation.clone();
|
|
||||||
let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
|
let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
|
||||||
events_sender.send(TaskStateUpdate::Started).ok();
|
events_sender.send(TaskStateUpdate::Started).ok();
|
||||||
task(events_sender, cancellation_clone).await
|
task(events_sender).await
|
||||||
// events_sender is dropped at some point during the .await above.
|
|
||||||
// But the task is still running on WALRECEIVER_RUNTIME.
|
|
||||||
// That is the window when `!jh.is_finished()`
|
|
||||||
// is true inside `fn next_task_event()` below.
|
|
||||||
});
|
});
|
||||||
|
|
||||||
TaskHandle {
|
TaskHandle {
|
||||||
@@ -136,23 +131,7 @@ impl<E: Clone> TaskHandle<E> {
|
|||||||
TaskEvent::End(match self.join_handle.as_mut() {
|
TaskEvent::End(match self.join_handle.as_mut() {
|
||||||
Some(jh) => {
|
Some(jh) => {
|
||||||
if !jh.is_finished() {
|
if !jh.is_finished() {
|
||||||
// Barring any implementation errors in this module, we can
|
warn!("sender is dropped while join handle is still alive");
|
||||||
// only arrive here while the task that executes the future
|
|
||||||
// passed to `Self::spawn()` is still execution. Cf the comment
|
|
||||||
// in Self::spawn().
|
|
||||||
//
|
|
||||||
// This was logging at warning level in earlier versions, presumably
|
|
||||||
// to leave some breadcrumbs in case we had an implementation
|
|
||||||
// error that would would make us get stuck in `jh.await`.
|
|
||||||
//
|
|
||||||
// There hasn't been such a bug so far.
|
|
||||||
// But in a busy system, e.g., during pageserver restart,
|
|
||||||
// we arrive here often enough that the warning-level logs
|
|
||||||
// became a distraction.
|
|
||||||
// So, tone them down to info-level.
|
|
||||||
//
|
|
||||||
// XXX: rewrite this module to eliminate the race condition.
|
|
||||||
info!("sender is dropped while join handle is still alive");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let res = jh
|
let res = jh
|
||||||
|
|||||||
@@ -11,10 +11,9 @@
|
|||||||
|
|
||||||
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
|
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||||
use crate::task_mgr::TaskKind;
|
|
||||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::{Timeline, TimelineRequestContext};
|
||||||
use crate::{task_mgr, walreceiver::TaskStateUpdate};
|
use crate::{task_mgr, walreceiver::TaskStateUpdate};
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use chrono::{NaiveDateTime, Utc};
|
use chrono::{NaiveDateTime, Utc};
|
||||||
@@ -47,7 +46,7 @@ pub fn spawn_connection_manager_task(
|
|||||||
lagging_wal_timeout: Duration,
|
lagging_wal_timeout: Duration,
|
||||||
max_lsn_wal_lag: NonZeroU64,
|
max_lsn_wal_lag: NonZeroU64,
|
||||||
auth_token: Option<Arc<String>>,
|
auth_token: Option<Arc<String>>,
|
||||||
ctx: RequestContext,
|
ctx: TimelineRequestContext,
|
||||||
) {
|
) {
|
||||||
let mut broker_client = get_broker_client().clone();
|
let mut broker_client = get_broker_client().clone();
|
||||||
|
|
||||||
@@ -56,9 +55,6 @@ pub fn spawn_connection_manager_task(
|
|||||||
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
WALRECEIVER_RUNTIME.handle(),
|
WALRECEIVER_RUNTIME.handle(),
|
||||||
TaskKind::WalReceiverManager,
|
|
||||||
Some(tenant_id),
|
|
||||||
Some(timeline_id),
|
|
||||||
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
|
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
@@ -72,10 +68,10 @@ pub fn spawn_connection_manager_task(
|
|||||||
);
|
);
|
||||||
loop {
|
loop {
|
||||||
select! {
|
select! {
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = ctx.cancelled() => {
|
||||||
info!("WAL receiver shutdown requested, shutting down");
|
info!("WAL receiver shutdown requested, shutting down");
|
||||||
walreceiver_state.shutdown().await;
|
walreceiver_state.shutdown().await;
|
||||||
return Ok(());
|
return;
|
||||||
},
|
},
|
||||||
loop_step_result = connection_manager_loop_step(
|
loop_step_result = connection_manager_loop_step(
|
||||||
&mut broker_client,
|
&mut broker_client,
|
||||||
@@ -86,7 +82,7 @@ pub fn spawn_connection_manager_task(
|
|||||||
ControlFlow::Break(()) => {
|
ControlFlow::Break(()) => {
|
||||||
info!("Connection manager loop ended, shutting down");
|
info!("Connection manager loop ended, shutting down");
|
||||||
walreceiver_state.shutdown().await;
|
walreceiver_state.shutdown().await;
|
||||||
return Ok(());
|
return;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -104,7 +100,7 @@ pub fn spawn_connection_manager_task(
|
|||||||
async fn connection_manager_loop_step(
|
async fn connection_manager_loop_step(
|
||||||
broker_client: &mut BrokerClientChannel,
|
broker_client: &mut BrokerClientChannel,
|
||||||
walreceiver_state: &mut WalreceiverState,
|
walreceiver_state: &mut WalreceiverState,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) -> ControlFlow<(), ()> {
|
) -> ControlFlow<(), ()> {
|
||||||
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
|
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
|
||||||
|
|
||||||
@@ -187,23 +183,13 @@ async fn connection_manager_loop_step(
|
|||||||
|
|
||||||
new_event = async {
|
new_event = async {
|
||||||
loop {
|
loop {
|
||||||
if walreceiver_state.timeline.current_state() == TimelineState::Loading {
|
|
||||||
warn!("wal connection manager should only be launched after timeline has become active");
|
|
||||||
}
|
|
||||||
match timeline_state_updates.changed().await {
|
match timeline_state_updates.changed().await {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
let new_state = walreceiver_state.timeline.current_state();
|
let new_state = walreceiver_state.timeline.current_state();
|
||||||
match new_state {
|
match new_state {
|
||||||
// we're already active as walreceiver, no need to reactivate
|
// we're already active as walreceiver, no need to reactivate
|
||||||
TimelineState::Active => continue,
|
TimelineState::Active => continue,
|
||||||
TimelineState::Broken | TimelineState::Stopping => {
|
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||||
info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
|
|
||||||
return ControlFlow::Break(());
|
|
||||||
}
|
|
||||||
TimelineState::Loading => {
|
|
||||||
warn!("timeline transitioned back to Loading state, that should not happen");
|
|
||||||
return ControlFlow::Continue(new_state);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
||||||
@@ -211,7 +197,7 @@ async fn connection_manager_loop_step(
|
|||||||
}
|
}
|
||||||
} => match new_event {
|
} => match new_event {
|
||||||
ControlFlow::Continue(new_state) => {
|
ControlFlow::Continue(new_state) => {
|
||||||
info!("observed timeline state change, new state is {new_state:?}");
|
info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates");
|
||||||
return ControlFlow::Continue(());
|
return ControlFlow::Continue(());
|
||||||
}
|
}
|
||||||
ControlFlow::Break(()) => {
|
ControlFlow::Break(()) => {
|
||||||
@@ -304,9 +290,7 @@ async fn subscribe_for_timeline_updates(
|
|||||||
return resp.into_inner();
|
return resp.into_inner();
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
|
||||||
// entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
|
|
||||||
info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -406,32 +390,38 @@ impl WalreceiverState {
|
|||||||
&mut self,
|
&mut self,
|
||||||
new_sk_id: NodeId,
|
new_sk_id: NodeId,
|
||||||
new_wal_source_connconf: PgConnectionConfig,
|
new_wal_source_connconf: PgConnectionConfig,
|
||||||
ctx: &RequestContext,
|
ctx: &TimelineRequestContext,
|
||||||
) {
|
) {
|
||||||
self.drop_old_connection(true).await;
|
self.drop_old_connection(true).await;
|
||||||
|
|
||||||
let id = self.id;
|
let id = self.id;
|
||||||
let connect_timeout = self.wal_connect_timeout;
|
let connect_timeout = self.wal_connect_timeout;
|
||||||
let timeline = Arc::clone(&self.timeline);
|
let timeline = Arc::clone(&self.timeline);
|
||||||
let ctx = ctx.detached_child(
|
|
||||||
TaskKind::WalReceiverConnectionHandler,
|
let child_ctx = ctx.register_another(RequestContext::with_parent(
|
||||||
ctx.download_behavior(),
|
TaskKind::WalReceiverConnection,
|
||||||
|
DownloadBehavior::Download,
|
||||||
|
ctx,
|
||||||
|
));
|
||||||
|
let cancellation_token = child_ctx.cancellation_token().clone();
|
||||||
|
|
||||||
|
let connection_handle = TaskHandle::spawn(
|
||||||
|
move |events_sender| {
|
||||||
|
async move {
|
||||||
|
super::walreceiver_connection::handle_walreceiver_connection(
|
||||||
|
timeline,
|
||||||
|
new_wal_source_connconf,
|
||||||
|
events_sender,
|
||||||
|
connect_timeout,
|
||||||
|
child_ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.context("walreceiver connection handling failure")
|
||||||
|
}
|
||||||
|
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
|
||||||
|
},
|
||||||
|
cancellation_token,
|
||||||
);
|
);
|
||||||
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
|
|
||||||
async move {
|
|
||||||
super::walreceiver_connection::handle_walreceiver_connection(
|
|
||||||
timeline,
|
|
||||||
new_wal_source_connconf,
|
|
||||||
events_sender,
|
|
||||||
cancellation,
|
|
||||||
connect_timeout,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.context("walreceiver connection handling failure")
|
|
||||||
}
|
|
||||||
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
|
|
||||||
});
|
|
||||||
|
|
||||||
let now = Utc::now().naive_utc();
|
let now = Utc::now().naive_utc();
|
||||||
self.wal_connection = Some(WalConnection {
|
self.wal_connection = Some(WalConnection {
|
||||||
@@ -843,6 +833,7 @@ fn wal_stream_connection_config(
|
|||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use url::Host;
|
use url::Host;
|
||||||
|
|
||||||
fn dummy_broker_sk_timeline(
|
fn dummy_broker_sk_timeline(
|
||||||
@@ -923,12 +914,15 @@ mod tests {
|
|||||||
started_at: now,
|
started_at: now,
|
||||||
sk_id: connected_sk_id,
|
sk_id: connected_sk_id,
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
connection_task: TaskHandle::spawn(
|
||||||
sender
|
move |sender| async move {
|
||||||
.send(TaskStateUpdate::Progress(connection_status))
|
sender
|
||||||
.ok();
|
.send(TaskStateUpdate::Progress(connection_status))
|
||||||
Ok(())
|
.ok();
|
||||||
}),
|
Ok(())
|
||||||
|
},
|
||||||
|
CancellationToken::new(),
|
||||||
|
),
|
||||||
discovered_new_wal: None,
|
discovered_new_wal: None,
|
||||||
});
|
});
|
||||||
state.wal_stream_candidates = HashMap::from([
|
state.wal_stream_candidates = HashMap::from([
|
||||||
@@ -1085,12 +1079,15 @@ mod tests {
|
|||||||
started_at: now,
|
started_at: now,
|
||||||
sk_id: connected_sk_id,
|
sk_id: connected_sk_id,
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
connection_task: TaskHandle::spawn(
|
||||||
sender
|
move |sender| async move {
|
||||||
.send(TaskStateUpdate::Progress(connection_status))
|
sender
|
||||||
.ok();
|
.send(TaskStateUpdate::Progress(connection_status))
|
||||||
Ok(())
|
.ok();
|
||||||
}),
|
Ok(())
|
||||||
|
},
|
||||||
|
CancellationToken::new(),
|
||||||
|
),
|
||||||
discovered_new_wal: None,
|
discovered_new_wal: None,
|
||||||
});
|
});
|
||||||
state.wal_stream_candidates = HashMap::from([
|
state.wal_stream_candidates = HashMap::from([
|
||||||
@@ -1150,12 +1147,15 @@ mod tests {
|
|||||||
started_at: now,
|
started_at: now,
|
||||||
sk_id: NodeId(1),
|
sk_id: NodeId(1),
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
connection_task: TaskHandle::spawn(
|
||||||
sender
|
move |sender| async move {
|
||||||
.send(TaskStateUpdate::Progress(connection_status))
|
sender
|
||||||
.ok();
|
.send(TaskStateUpdate::Progress(connection_status))
|
||||||
Ok(())
|
.ok();
|
||||||
}),
|
Ok(())
|
||||||
|
},
|
||||||
|
CancellationToken::new(),
|
||||||
|
),
|
||||||
discovered_new_wal: None,
|
discovered_new_wal: None,
|
||||||
});
|
});
|
||||||
state.wal_stream_candidates = HashMap::from([(
|
state.wal_stream_candidates = HashMap::from([(
|
||||||
@@ -1212,7 +1212,10 @@ mod tests {
|
|||||||
started_at: now,
|
started_at: now,
|
||||||
sk_id: NodeId(1),
|
sk_id: NodeId(1),
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
|
connection_task: TaskHandle::spawn(
|
||||||
|
move |_| async move { Ok(()) },
|
||||||
|
CancellationToken::new(),
|
||||||
|
),
|
||||||
discovered_new_wal: Some(NewCommittedWAL {
|
discovered_new_wal: Some(NewCommittedWAL {
|
||||||
discovered_at: time_over_threshold,
|
discovered_at: time_over_threshold,
|
||||||
lsn: new_lsn,
|
lsn: new_lsn,
|
||||||
@@ -1256,11 +1259,11 @@ mod tests {
|
|||||||
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
|
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
|
||||||
|
|
||||||
async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
|
async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
|
||||||
let (tenant, ctx) = harness.load().await;
|
let (tenant, tenant_ctx) = harness.load().await;
|
||||||
let timeline = tenant
|
let (timeline, timeline_ctx) = tenant
|
||||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
|
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &tenant_ctx)
|
||||||
.expect("Failed to create an empty timeline for dummy wal connection manager");
|
.expect("Failed to create an empty timeline for dummy wal connection manager");
|
||||||
let timeline = timeline.initialize(&ctx).unwrap();
|
let timeline = timeline.initialize(&timeline_ctx).unwrap();
|
||||||
|
|
||||||
WalreceiverState {
|
WalreceiverState {
|
||||||
id: TenantTimelineId {
|
id: TenantTimelineId {
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user