mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-24 13:50:37 +00:00
Compare commits
21 Commits
prefetch_s
...
sergey/fix
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b39d6126bb | ||
|
|
0bc488b723 | ||
|
|
0c915dcb1d | ||
|
|
feb07ed510 | ||
|
|
4603a4cbb5 | ||
|
|
02c1c351dc | ||
|
|
607c0facfc | ||
|
|
e5d523c86a | ||
|
|
7a16cde737 | ||
|
|
d6325aa79d | ||
|
|
544777e86b | ||
|
|
e2ae4c09a6 | ||
|
|
22ae67af8d | ||
|
|
d1edc8aa00 | ||
|
|
f013d53230 | ||
|
|
0aa2f5c9a5 | ||
|
|
26f4ff949a | ||
|
|
a1fd0ba23b | ||
|
|
32662ff1c4 | ||
|
|
249d77c720 | ||
|
|
0f445827f5 |
3
.github/ansible/neon-stress.hosts.yaml
vendored
3
.github/ansible/neon-stress.hosts.yaml
vendored
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: neon-storage-ireland
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://neon-stress-console.local
|
||||
etcd_endpoints: neon-stress-etcd.local:2379
|
||||
broker_endpoint: http://storage-broker.neon-stress.local:50051
|
||||
safekeeper_enable_s3_offload: 'false'
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
@@ -14,6 +14,7 @@ storage:
|
||||
safekeeper_s3_prefix: neon-stress/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
sentry_environment: development
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
|
||||
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: neon-prod-storage-ap-southeast-1
|
||||
bucket_region: ap-southeast-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
|
||||
broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
@@ -16,6 +16,7 @@ storage:
|
||||
ansible_aws_ssm_region: ap-southeast-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
|
||||
console_region_id: aws-ap-southeast-1
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
|
||||
3
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
3
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: neon-prod-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
|
||||
broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
@@ -16,6 +16,7 @@ storage:
|
||||
ansible_aws_ssm_region: eu-central-1
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
|
||||
console_region_id: aws-eu-central-1
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
|
||||
3
.github/ansible/prod.us-east-2.hosts.yaml
vendored
3
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: neon-prod-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
|
||||
broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
@@ -16,6 +16,7 @@ storage:
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
|
||||
37
.github/ansible/prod.us-west-2.hosts.yaml
vendored
Normal file
37
.github/ansible/prod.us-west-2.hosts.yaml
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-west-2
|
||||
bucket_region: us-west-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
etcd_endpoints: etcd-0.us-west-2.aws.neon.tech:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "pageserver/v1"
|
||||
safekeeper_s3_prefix: safekeeper/v1/wal
|
||||
hostname_suffix: ""
|
||||
remote_user: ssm-user
|
||||
ansible_aws_ssm_region: us-west-2
|
||||
ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2
|
||||
console_region_id: aws-us-west-2-new
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
pageserver-0.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0d9f6dfae0e1c780d
|
||||
pageserver-1.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0c834be1dddba8b3f
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
safekeeper-0.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-00719d8a74986fda6
|
||||
safekeeper-1.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-074682f9d3c712e7c
|
||||
safekeeper-2.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-042b7efb1729d7966
|
||||
|
||||
3
.github/ansible/production.hosts.yaml
vendored
3
.github/ansible/production.hosts.yaml
vendored
@@ -4,7 +4,7 @@ storage:
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
bucket_name: zenith-storage-oregon
|
||||
bucket_region: us-west-2
|
||||
etcd_endpoints: zenith-1-etcd.local:2379
|
||||
broker_endpoint: http://storage-broker.prod.local:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
@@ -14,6 +14,7 @@ storage:
|
||||
safekeeper_s3_prefix: prod-1/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
|
||||
3
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
3
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: neon-dev-storage-eu-west-1
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379
|
||||
broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
@@ -16,6 +16,7 @@ storage:
|
||||
ansible_aws_ssm_region: eu-west-1
|
||||
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
|
||||
console_region_id: aws-eu-west-1
|
||||
sentry_environment: development
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
|
||||
3
.github/ansible/staging.hosts.yaml
vendored
3
.github/ansible/staging.hosts.yaml
vendored
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: zenith-staging-storage-us-east-1
|
||||
bucket_region: us-east-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
|
||||
broker_endpoint: http://storage-broker.staging.local:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
@@ -13,6 +13,7 @@ storage:
|
||||
safekeeper_s3_prefix: us-stage/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
sentry_environment: development
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
|
||||
3
.github/ansible/staging.us-east-2.hosts.yaml
vendored
3
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: neon-staging-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
|
||||
broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
@@ -16,6 +16,7 @@ storage:
|
||||
ansible_aws_ssm_region: us-east-2
|
||||
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
|
||||
console_region_id: aws-us-east-2
|
||||
sentry_environment: development
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
|
||||
4
.github/ansible/systemd/pageserver.service
vendored
4
.github/ansible/systemd/pageserver.service
vendored
@@ -5,8 +5,8 @@ After=network.target auditd.service
|
||||
[Service]
|
||||
Type=simple
|
||||
User=pageserver
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }}
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
4
.github/ansible/systemd/safekeeper.service
vendored
4
.github/ansible/systemd/safekeeper.service
vendored
@@ -5,8 +5,8 @@ After=network.target auditd.service
|
||||
[Service]
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
|
||||
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.eu-west-1.aws.neon.build"
|
||||
sentryEnvironment: "development"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ ingress:
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
# we have basically infinite streams, disable body size limit
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
@@ -51,3 +53,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "development"
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "link"
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
sentryEnvironment: "development"
|
||||
|
||||
# -- Additional labels for neon-proxy-link pods
|
||||
podLabels:
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
sentryEnvironment: "development"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
sentryEnvironment: "development"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ ingress:
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
# we have basically infinite streams, disable body size limit
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
@@ -51,3 +53,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "development"
|
||||
|
||||
@@ -3,22 +3,22 @@ podLabels:
|
||||
neon_env: neon-stress
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
# Use L4 LB
|
||||
service:
|
||||
# service.annotations -- Annotations to add to the service
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||
# assign service to this name at external-dns
|
||||
external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local
|
||||
# service.type -- Service type
|
||||
type: LoadBalancer
|
||||
# service.port -- broker listen port
|
||||
port: 50051
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-stress.stage.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
ingress:
|
||||
enabled: false
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
@@ -52,3 +52,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "development"
|
||||
|
||||
@@ -24,3 +24,6 @@ metrics:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "development"
|
||||
|
||||
1
.github/helm-values/neon-stress.proxy.yaml
vendored
1
.github/helm-values/neon-stress.proxy.yaml
vendored
@@ -4,6 +4,7 @@ settings:
|
||||
authBackend: "link"
|
||||
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.dev.neon.tech/psql_session/"
|
||||
sentryEnvironment: "development"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ ingress:
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
# we have basically infinite streams, disable body size limit
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
@@ -51,3 +53,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ ingress:
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
# we have basically infinite streams, disable body size limit
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
@@ -51,3 +53,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ ingress:
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
# we have basically infinite streams, disable body size limit
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
@@ -51,3 +53,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
|
||||
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ ingress:
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
# we have basically infinite streams, disable body size limit
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
@@ -51,3 +53,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
|
||||
@@ -3,22 +3,22 @@ podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
# Use L4 LB
|
||||
service:
|
||||
# service.annotations -- Annotations to add to the service
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||
# assign service to this name at external-dns
|
||||
external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
|
||||
# service.type -- Service type
|
||||
type: LoadBalancer
|
||||
# service.port -- broker listen port
|
||||
port: 50051
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
ingress:
|
||||
enabled: false
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
@@ -52,3 +52,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
|
||||
@@ -22,3 +22,6 @@ metrics:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
|
||||
1
.github/helm-values/production.proxy.yaml
vendored
1
.github/helm-values/production.proxy.yaml
vendored
@@ -2,6 +2,7 @@ settings:
|
||||
authBackend: "link"
|
||||
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.neon.tech/psql_session/"
|
||||
sentryEnvironment: "production"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -3,22 +3,22 @@ podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
# Use L4 LB
|
||||
service:
|
||||
# service.annotations -- Annotations to add to the service
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||
# assign service to this name at external-dns
|
||||
external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local
|
||||
# service.type -- Service type
|
||||
type: LoadBalancer
|
||||
# service.port -- broker listen port
|
||||
port: 50051
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.stage.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
ingress:
|
||||
enabled: false
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
@@ -52,3 +52,5 @@ extraManifests:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "development"
|
||||
|
||||
1
.github/helm-values/staging.proxy-scram.yaml
vendored
1
.github/helm-values/staging.proxy-scram.yaml
vendored
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
sentryEnvironment: "development"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
|
||||
1
.github/helm-values/staging.proxy.yaml
vendored
1
.github/helm-values/staging.proxy.yaml
vendored
@@ -8,6 +8,7 @@ settings:
|
||||
authBackend: "link"
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
sentryEnvironment: "development"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
|
||||
171
.github/workflows/build_and_test.yml
vendored
171
.github/workflows/build_and_test.yml
vendored
@@ -7,6 +7,10 @@ on:
|
||||
- release
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
|
||||
@@ -45,6 +49,83 @@ jobs:
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
check-codestyle-python:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
id: cache_poetry
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run isort to ensure code format
|
||||
run: poetry run isort --diff --check .
|
||||
|
||||
- name: Run black to ensure code format
|
||||
run: poetry run black --diff --check .
|
||||
|
||||
- name: Run flake8 to ensure code format
|
||||
run: poetry run flake8 .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-rust:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Restore cargo deps cache
|
||||
id: cache_cargo
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry/
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git/
|
||||
target/
|
||||
key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
build-neon:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
@@ -79,12 +160,10 @@ jobs:
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
# Set some environment variables used by all the steps.
|
||||
#
|
||||
@@ -101,16 +180,15 @@ jobs:
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FEATURES="--features testing"
|
||||
CARGO_FLAGS="--locked --timings $CARGO_FEATURES"
|
||||
CARGO_FLAGS="--locked $CARGO_FEATURES"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FEATURES="--features testing,profiling"
|
||||
CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
|
||||
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
|
||||
fi
|
||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
|
||||
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
# Don't include the ~/.cargo/registry/src directory. It contains just
|
||||
# uncompressed versions of the crates in ~/.cargo/registry/cache
|
||||
@@ -127,8 +205,8 @@ jobs:
|
||||
target/
|
||||
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
key: |
|
||||
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
|
||||
v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
@@ -147,26 +225,21 @@ jobs:
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v14 -j$(nproc)
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v15 -j$(nproc)
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
${cov_prefix} cargo test $CARGO_FLAGS
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
@@ -207,11 +280,9 @@ jobs:
|
||||
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
fi
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Install postgres binaries
|
||||
run: cp -a pg_install /tmp/neon/pg_install
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Upload Neon artifact
|
||||
uses: ./.github/actions/upload
|
||||
@@ -219,17 +290,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Prepare cargo build timing stats for storing
|
||||
run: |
|
||||
mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/"
|
||||
cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/"
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: Upload cargo build stats
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats
|
||||
path: /tmp/neon/cargo-timings/
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
@@ -250,7 +310,7 @@ jobs:
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -284,7 +344,7 @@ jobs:
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Pytest benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -330,7 +390,6 @@ jobs:
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
|
||||
./scripts/pysync
|
||||
@@ -363,7 +422,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git/
|
||||
target/
|
||||
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -379,7 +438,6 @@ jobs:
|
||||
|
||||
- name: Merge coverage data
|
||||
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build and upload coverage report
|
||||
run: |
|
||||
@@ -412,7 +470,6 @@ jobs:
|
||||
\"description\": \"Coverage report is ready\",
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
trigger-e2e-tests:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
@@ -463,6 +520,9 @@ jobs:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs: [ tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -481,6 +541,9 @@ jobs:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs: [ tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -496,6 +559,10 @@ jobs:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
needs: [ tag ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
@@ -513,6 +580,10 @@ jobs:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
needs: [ tag ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
@@ -658,7 +729,7 @@ jobs:
|
||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
calculate-deploy-targets:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
@@ -669,7 +740,7 @@ jobs:
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
|
||||
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY", storage_broker_config: }'
|
||||
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
|
||||
echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
|
||||
@@ -680,8 +751,8 @@ jobs:
|
||||
fi
|
||||
|
||||
deploy:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
#container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
@@ -701,16 +772,6 @@ jobs:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
- name: Setup ansible
|
||||
run: |
|
||||
export PATH="/root/.local/bin:$PATH"
|
||||
pip install --progress-bar off --user ansible boto3 toml
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
@@ -732,7 +793,7 @@ jobs:
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-new:
|
||||
@@ -780,7 +841,7 @@ jobs:
|
||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||
needs: [ push-docker-hub, tag, regress-tests ]
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
|
||||
contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
defaults:
|
||||
run:
|
||||
@@ -820,7 +881,7 @@ jobs:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
|
||||
target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -848,7 +909,7 @@ jobs:
|
||||
|
||||
deploy-proxy:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
@@ -888,9 +949,10 @@ jobs:
|
||||
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
deploy-storage-broker-staging:
|
||||
deploy-storage-broker:
|
||||
name: deploy storage broker on old staging and old prod
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
# Compute image isn't strictly required for storage broker deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
@@ -926,7 +988,7 @@ jobs:
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
deploy-proxy-new:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
@@ -1011,7 +1073,7 @@ jobs:
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
deploy-proxy-prod-new:
|
||||
runs-on: prod
|
||||
@@ -1088,7 +1150,7 @@ jobs:
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
promote-compatibility-data:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
@@ -1099,7 +1161,6 @@ jobs:
|
||||
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
|
||||
steps:
|
||||
- name: Promote compatibility snapshot for the release
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
PREFIX: artifacts/latest
|
||||
|
||||
166
.github/workflows/codestyle.yml
vendored
166
.github/workflows/codestyle.yml
vendored
@@ -1,166 +0,0 @@
|
||||
name: Check code style and build
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# XXX: both OSes have rustup
|
||||
# * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools
|
||||
# * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools
|
||||
# this is all we need to install our toolchain later via rust-toolchain.toml
|
||||
# so don't install any toolchain explicitly.
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
timeout-minutes: 90
|
||||
name: check codestyle rust and postgres
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check formatting
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
- name: Install Ubuntu postgres dependencies
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler
|
||||
|
||||
- name: Install macOS postgres dependencies
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: brew install flex bison openssl protobuf
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: make postgres-v14
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: make postgres-v15
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
- name: Ensure all project builds
|
||||
run: cargo build --locked --all --all-targets
|
||||
|
||||
check-rust-dependencies:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check every project module is covered by Hakari
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
check-codestyle-python:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
id: cache_poetry
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run isort to ensure code format
|
||||
run: poetry run isort --diff --check .
|
||||
|
||||
- name: Run black to ensure code format
|
||||
run: poetry run black --diff --check .
|
||||
|
||||
- name: Run flake8 to ensure code format
|
||||
run: poetry run flake8 .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
128
.github/workflows/neon_extra_builds.yml
vendored
Normal file
128
.github/workflows/neon_extra_builds.yml
vendored
Normal file
@@ -0,0 +1,128 @@
|
||||
name: Check neon with extra platform builds
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
check-macos-build:
|
||||
timeout-minutes: 90
|
||||
runs-on: macos-latest
|
||||
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install macOS postgres dependencies
|
||||
run: brew install flex bison openssl protobuf
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: cargo build --all --release
|
||||
|
||||
- name: Check that no warnings are produced
|
||||
run: ./run_clippy.sh
|
||||
|
||||
gather-rust-build-stats:
|
||||
timeout-minutes: 90
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
BUILD_TYPE: release
|
||||
# builds with incremental compilation produce partial results
|
||||
# so do not attempt to cache this build, also disable the incremental compilation
|
||||
CARGO_INCREMENTAL: 0
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install Ubuntu postgres dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
- name: Produce the build stats
|
||||
run: cargo build --all --release --timings
|
||||
|
||||
- name: Upload the build stats
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-build-stats
|
||||
path: ./target/cargo-timings/
|
||||
901
Cargo.lock
generated
901
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -79,7 +79,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
|
||||
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
|
||||
-c "id=1234" \
|
||||
-c "broker_endpoints=['http://etcd:2379']" \
|
||||
-c "broker_endpoint='http://storage_broker:50051'" \
|
||||
-c "pg_distrib_dir='/usr/local/'" \
|
||||
-c "listen_pg_addr='0.0.0.0:6400'" \
|
||||
-c "listen_http_addr='0.0.0.0:9898'"
|
||||
|
||||
14
README.md
14
README.md
@@ -26,12 +26,12 @@ See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more inf
|
||||
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
|
||||
```bash
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client protobuf-compiler
|
||||
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
|
||||
```
|
||||
* On Fedora, these packages are needed:
|
||||
```bash
|
||||
dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib protobuf-compiler
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
@@ -44,7 +44,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
1. Install XCode and dependencies
|
||||
```
|
||||
xcode-select --install
|
||||
brew install protobuf etcd openssl flex bison
|
||||
brew install protobuf openssl flex bison
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
@@ -123,12 +123,12 @@ Stopped pageserver 1 process with pid 2545906
|
||||
|
||||
# start pageserver and safekeeper
|
||||
> ./target/debug/neon_local start
|
||||
Starting etcd broker using "/usr/bin/etcd"
|
||||
etcd started, pid: 2545996
|
||||
Starting neon broker at 127.0.0.1:50051
|
||||
storage_broker started, pid: 2918372
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'.
|
||||
pageserver started, pid: 2546005
|
||||
pageserver started, pid: 2918386
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
|
||||
safekeeper 1 started, pid: 2546041
|
||||
safekeeper 1 started, pid: 2918437
|
||||
|
||||
# start postgres compute node
|
||||
> ./target/debug/neon_local pg start main
|
||||
|
||||
@@ -25,5 +25,7 @@ url = "2.2.2"
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_connection = { path = "../libs/postgres_connection" }
|
||||
safekeeper_api = { path = "../libs/safekeeper_api" }
|
||||
# Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -10,5 +10,5 @@ id = 1
|
||||
pg_port = 5454
|
||||
http_port = 7676
|
||||
|
||||
[etcd_broker]
|
||||
broker_endpoints = ['http://127.0.0.1:2379']
|
||||
[broker]
|
||||
listen_addr = '127.0.0.1:50051'
|
||||
|
||||
@@ -51,21 +51,21 @@ pub enum InitialPidFile<'t> {
|
||||
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
pub fn start_process<
|
||||
F,
|
||||
S: AsRef<OsStr>,
|
||||
EI: IntoIterator<Item = (String, String)>, // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
|
||||
>(
|
||||
pub fn start_process<F, AI, A, EI>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
command: &Path,
|
||||
args: &[S],
|
||||
args: AI,
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
where
|
||||
F: Fn() -> anyhow::Result<bool>,
|
||||
AI: IntoIterator<Item = A>,
|
||||
A: AsRef<OsStr>,
|
||||
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
|
||||
EI: IntoIterator<Item = (String, String)>,
|
||||
{
|
||||
let log_path = datadir.join(format!("{process_name}.log"));
|
||||
let process_log_file = fs::OpenOptions::new()
|
||||
|
||||
@@ -8,10 +8,10 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env::{EtcdBroker, LocalEnv};
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::{etcd, local_env};
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::models::TimelineInfo;
|
||||
use pageserver_api::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
|
||||
@@ -22,9 +22,10 @@ use safekeeper_api::{
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
};
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -41,13 +42,12 @@ project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "14";
|
||||
|
||||
fn default_conf(etcd_binary_path: &Path) -> String {
|
||||
fn default_conf() -> String {
|
||||
format!(
|
||||
r#"
|
||||
# Default built-in configuration, defined in main.rs
|
||||
[etcd_broker]
|
||||
broker_endpoints = ['http://localhost:2379']
|
||||
etcd_binary_path = '{etcd_binary_path}'
|
||||
[broker]
|
||||
listen_addr = '{DEFAULT_BROKER_ADDR}'
|
||||
|
||||
[pageserver]
|
||||
id = {DEFAULT_PAGESERVER_ID}
|
||||
@@ -60,7 +60,6 @@ id = {DEFAULT_SAFEKEEPER_ID}
|
||||
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
|
||||
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
|
||||
"#,
|
||||
etcd_binary_path = etcd_binary_path.display(),
|
||||
pageserver_auth_type = AuthType::Trust,
|
||||
)
|
||||
}
|
||||
@@ -298,7 +297,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
})?
|
||||
} else {
|
||||
// Built-in default config
|
||||
default_conf(&EtcdBroker::locate_etcd()?)
|
||||
default_conf()
|
||||
};
|
||||
|
||||
let pg_version = init_match
|
||||
@@ -342,7 +341,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
|
||||
.get_many::<String>("pageserver-config-override")
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|s| s.as_str())
|
||||
.map(String::as_str)
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -807,14 +806,14 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
}
|
||||
|
||||
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
etcd::start_etcd_process(env)?;
|
||||
broker::start_broker_process(env)?;
|
||||
let pageserver = PageServerNode::from_env(env);
|
||||
|
||||
// Postgres nodes are not started automatically
|
||||
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
try_stop_etcd_process(env);
|
||||
try_stop_storage_broker_process(env);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -822,7 +821,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start() {
|
||||
eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
|
||||
try_stop_etcd_process(env);
|
||||
try_stop_storage_broker_process(env);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@@ -854,14 +853,14 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
|
||||
}
|
||||
}
|
||||
|
||||
try_stop_etcd_process(env);
|
||||
try_stop_storage_broker_process(env);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_stop_etcd_process(env: &local_env::LocalEnv) {
|
||||
if let Err(e) = etcd::stop_etcd_process(env) {
|
||||
eprintln!("etcd stop failed: {e}");
|
||||
fn try_stop_storage_broker_process(env: &local_env::LocalEnv) {
|
||||
if let Err(e) = broker::stop_broker_process(env) {
|
||||
eprintln!("neon broker stop failed: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
48
control_plane/src/broker.rs
Normal file
48
control_plane/src/broker.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
use anyhow::Context;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
/// Starts the `storage_broker` binary as a background process for the given local environment.
///
/// The broker is launched with `--listen-addr=<env.broker.listen_addr>`, no extra environment
/// variables, `env.base_data_dir` as its data directory, and a pid file created at
/// `<base_data_dir>/storage_broker.pid`.
///
/// # Errors
///
/// Returns an error (with context "Failed to spawn storage_broker subprocess") if
/// `background_process::start_process` fails. The status-check closure itself fails only when
/// the `/status` URL cannot be built or the HTTP request cannot be constructed.
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let broker = &env.broker;
|
||||
let listen_addr = &broker.listen_addr;
|
||||
|
||||
// `print!` (no trailing newline) is used here.
// NOTE(review): presumably `start_process` continues this line with progress output — confirm.
print!("Starting neon broker at {}", listen_addr);
|
||||
|
||||
let args = [format!("--listen-addr={listen_addr}")];
|
||||
|
||||
// One blocking HTTP client is created up front and reused by every status check below.
let client = reqwest::blocking::Client::new();
|
||||
background_process::start_process(
|
||||
"storage_broker",
|
||||
&env.base_data_dir,
|
||||
&env.storage_broker_bin(),
|
||||
&args,
|
||||
// No additional environment variables are passed to the broker process.
[],
|
||||
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|
||||
// Status check callback: reports `true` only when GET `<client_url>/status` succeeds.
// NOTE(review): presumably polled by `start_process` until it returns `true` — confirm.
|| {
|
||||
let url = broker.client_url();
|
||||
let status_url = url.join("status").with_context(|| {
|
||||
format!("Failed to append /status path to broker endpoint {url}",)
|
||||
})?;
|
||||
let request = client
|
||||
.get(status_url)
|
||||
.build()
|
||||
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
|
||||
// A transport-level failure is reported as "not ready" rather than as an error;
// a received response counts as ready only when its status code is a success.
match client.execute(request) {
|
||||
Ok(resp) => Ok(resp.status().is_success()),
|
||||
Err(_) => Ok(false),
|
||||
}
|
||||
},
|
||||
)
|
||||
.context("Failed to spawn storage_broker subprocess")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stops the background `storage_broker` process recorded in
/// `<base_data_dir>/storage_broker.pid` by delegating to `background_process::stop_process`.
///
/// NOTE(review): the first argument `true` presumably requests an immediate stop — confirm
/// against `background_process::stop_process`.
pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
|
||||
}
|
||||
|
||||
/// Returns the path of the broker pid file: `storage_broker.pid` inside `env.base_data_dir`.
/// Both the start and stop functions use this path, so they agree on where the pid is kept.
fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
env.base_data_dir.join("storage_broker.pid")
|
||||
}
|
||||
@@ -1,78 +0,0 @@
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_broker = &env.etcd_broker;
|
||||
print!(
|
||||
"Starting etcd broker using {:?}",
|
||||
etcd_broker.etcd_binary_path
|
||||
);
|
||||
|
||||
let etcd_data_dir = env.base_data_dir.join("etcd");
|
||||
fs::create_dir_all(&etcd_data_dir)
|
||||
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
|
||||
|
||||
let client_urls = etcd_broker.comma_separated_endpoints();
|
||||
let args = [
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
// etcd doesn't compact (vacuum) with default settings,
|
||||
// enable it to prevent space exhaustion.
|
||||
"--auto-compaction-mode=revision".to_string(),
|
||||
"--auto-compaction-retention=1".to_string(),
|
||||
];
|
||||
|
||||
let pid_file_path = etcd_pid_file_path(env);
|
||||
|
||||
let client = reqwest::blocking::Client::new();
|
||||
|
||||
background_process::start_process(
|
||||
"etcd",
|
||||
&etcd_data_dir,
|
||||
&etcd_broker.etcd_binary_path,
|
||||
&args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(&pid_file_path),
|
||||
|| {
|
||||
for broker_endpoint in &etcd_broker.broker_endpoints {
|
||||
let request = broker_endpoint
|
||||
.join("health")
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to append /health path to broker endopint {}",
|
||||
broker_endpoint
|
||||
)
|
||||
})
|
||||
.and_then(|url| {
|
||||
client.get(&url.to_string()).build().with_context(|| {
|
||||
format!("Failed to construct request to etcd endpoint {url}")
|
||||
})
|
||||
})?;
|
||||
if client.execute(request).is_ok() {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
},
|
||||
)
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
|
||||
}
|
||||
|
||||
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
env.base_data_dir.join("etcd.pid")
|
||||
}
|
||||
@@ -8,8 +8,8 @@
|
||||
//
|
||||
|
||||
mod background_process;
|
||||
pub mod broker;
|
||||
pub mod compute;
|
||||
pub mod etcd;
|
||||
pub mod local_env;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
|
||||
@@ -4,12 +4,16 @@
|
||||
//! script which will use local paths.
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv4Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use utils::{
|
||||
@@ -62,7 +66,7 @@ pub struct LocalEnv {
|
||||
#[serde(default)]
|
||||
pub private_key_path: PathBuf,
|
||||
|
||||
pub etcd_broker: EtcdBroker,
|
||||
pub broker: NeonBroker,
|
||||
|
||||
pub pageserver: PageServerConf,
|
||||
|
||||
@@ -78,67 +82,26 @@ pub struct LocalEnv {
|
||||
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
}
|
||||
|
||||
/// Etcd broker config for cluster internal communication.
|
||||
#[serde_as]
|
||||
/// Broker config for cluster internal communication.
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EtcdBroker {
|
||||
/// A prefix to add to any key when pushing/polling etcd from a node.
|
||||
#[serde(default)]
|
||||
pub broker_etcd_prefix: Option<String>,
|
||||
|
||||
/// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Vec<DisplayFromStr>")]
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
|
||||
/// Etcd binary path to use.
|
||||
#[serde(default)]
|
||||
pub etcd_binary_path: PathBuf,
|
||||
#[serde(default)]
|
||||
pub struct NeonBroker {
|
||||
/// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:50051'.
|
||||
pub listen_addr: SocketAddr,
|
||||
}
|
||||
|
||||
impl EtcdBroker {
|
||||
pub fn locate_etcd() -> anyhow::Result<PathBuf> {
|
||||
let which_output = Command::new("which")
|
||||
.arg("etcd")
|
||||
.output()
|
||||
.context("Failed to run 'which etcd' command")?;
|
||||
let stdout = String::from_utf8_lossy(&which_output.stdout);
|
||||
ensure!(
|
||||
which_output.status.success(),
|
||||
"'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
|
||||
which_output.status,
|
||||
String::from_utf8_lossy(&which_output.stderr)
|
||||
);
|
||||
|
||||
let etcd_path = PathBuf::from(stdout.trim());
|
||||
ensure!(
|
||||
etcd_path.is_file(),
|
||||
"'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
|
||||
etcd_path.display()
|
||||
);
|
||||
|
||||
Ok(etcd_path)
|
||||
// Dummy Default impl to satisfy Deserialize derive.
|
||||
impl Default for NeonBroker {
|
||||
fn default() -> Self {
|
||||
NeonBroker {
|
||||
listen_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn comma_separated_endpoints(&self) -> String {
|
||||
self.broker_endpoints
|
||||
.iter()
|
||||
.map(|url| {
|
||||
// URL by default adds a '/' path at the end, which is not what etcd CLI wants.
|
||||
let url_string = url.as_str();
|
||||
if url_string.ends_with('/') {
|
||||
&url_string[0..url_string.len() - 1]
|
||||
} else {
|
||||
url_string
|
||||
}
|
||||
})
|
||||
.fold(String::new(), |mut comma_separated_urls, url| {
|
||||
if !comma_separated_urls.is_empty() {
|
||||
comma_separated_urls.push(',');
|
||||
}
|
||||
comma_separated_urls.push_str(url);
|
||||
comma_separated_urls
|
||||
})
|
||||
impl NeonBroker {
|
||||
pub fn client_url(&self) -> Url {
|
||||
Url::parse(&format!("http://{}", self.listen_addr)).expect("failed to construct url")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -234,6 +197,10 @@ impl LocalEnv {
|
||||
self.neon_distrib_dir.join("safekeeper")
|
||||
}
|
||||
|
||||
pub fn storage_broker_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("storage_broker")
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
self.base_data_dir.join("pgdatadirs").join("tenants")
|
||||
}
|
||||
@@ -511,8 +478,8 @@ mod tests {
|
||||
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
|
||||
);
|
||||
|
||||
let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
|
||||
let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
|
||||
let string_to_replace = "listen_addr = '127.0.0.1:50051'";
|
||||
let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
|
||||
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
|
||||
assert!(
|
||||
spoiled_url_toml.contains(spoiled_url_str),
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Write};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Child;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Child, Command};
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
@@ -96,13 +97,8 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn initialize(
|
||||
&self,
|
||||
create_tenant: Option<TenantId>,
|
||||
initial_timeline_id: Option<TimelineId>,
|
||||
config_overrides: &[&str],
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
// pageserver conf overrides defined by neon_local configuration.
|
||||
fn neon_local_overrides(&self) -> Vec<String> {
|
||||
let id = format!("id={}", self.env.pageserver.id);
|
||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
|
||||
let pg_distrib_dir_param = format!(
|
||||
@@ -117,44 +113,54 @@ impl PageServerNode {
|
||||
);
|
||||
let listen_pg_addr_param =
|
||||
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
|
||||
let broker_endpoints_param = format!(
|
||||
"broker_endpoints=[{}]",
|
||||
self.env
|
||||
.etcd_broker
|
||||
.broker_endpoints
|
||||
.iter()
|
||||
.map(|url| format!("'{url}'"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
let broker_etcd_prefix_param = self
|
||||
.env
|
||||
.etcd_broker
|
||||
.broker_etcd_prefix
|
||||
.as_ref()
|
||||
.map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
|
||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||
|
||||
let mut init_config_overrides = config_overrides.to_vec();
|
||||
init_config_overrides.push(&id);
|
||||
init_config_overrides.push(&pg_distrib_dir_param);
|
||||
init_config_overrides.push(&authg_type_param);
|
||||
init_config_overrides.push(&listen_http_addr_param);
|
||||
init_config_overrides.push(&listen_pg_addr_param);
|
||||
init_config_overrides.push(&broker_endpoints_param);
|
||||
|
||||
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
|
||||
init_config_overrides.push(broker_etcd_prefix_param);
|
||||
}
|
||||
let mut overrides = vec![
|
||||
id,
|
||||
pg_distrib_dir_param,
|
||||
authg_type_param,
|
||||
listen_http_addr_param,
|
||||
listen_pg_addr_param,
|
||||
broker_endpoint_param,
|
||||
];
|
||||
|
||||
if self.env.pageserver.auth_type != AuthType::Trust {
|
||||
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
|
||||
overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
|
||||
}
|
||||
overrides
|
||||
}
|
||||
|
||||
/// Initializes a pageserver node by creating its config with the overrides provided,
|
||||
/// and creating an initial tenant and timeline afterwards.
|
||||
pub fn initialize(
|
||||
&self,
|
||||
create_tenant: Option<TenantId>,
|
||||
initial_timeline_id: Option<TimelineId>,
|
||||
config_overrides: &[&str],
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
|
||||
self.pageserver_init(config_overrides).with_context(|| {
|
||||
format!(
|
||||
"Failed to run init for pageserver node {}",
|
||||
self.env.pageserver.id,
|
||||
)
|
||||
})?;
|
||||
|
||||
// Then, briefly start it fully to run HTTP commands on it,
|
||||
// to create initial tenant and timeline.
|
||||
// We disable the remote storage, since we stop pageserver right after the timeline creation,
|
||||
// hence most of the uploads will either be aborted or not started: no point in starting them at all.
|
||||
let disabled_remote_storage_override = "remote_storage={}";
|
||||
let mut pageserver_process = self
|
||||
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
|
||||
.start_node(
|
||||
&[disabled_remote_storage_override],
|
||||
// Previous overrides will be taken from the config created before, don't overwrite them.
|
||||
false,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to start a process for pageserver {}",
|
||||
"Failed to start a process for pageserver node {}",
|
||||
self.env.pageserver.id,
|
||||
)
|
||||
})?;
|
||||
@@ -215,52 +221,73 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, &self.repo_path(), false)
|
||||
self.start_node(config_overrides, false)
|
||||
}
|
||||
|
||||
fn start_node(
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
datadir: &Path,
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting pageserver at '{}' in '{}'",
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
let datadir = self.repo_path();
|
||||
let node_id = self.env.pageserver.id;
|
||||
println!(
|
||||
"Initializing pageserver node {} at '{}' in {:?}",
|
||||
node_id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir.display()
|
||||
datadir
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
];
|
||||
let datadir_path_str = datadir.to_str().with_context(|| {
|
||||
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
|
||||
})?;
|
||||
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
|
||||
args.push(Cow::Borrowed("--init"));
|
||||
|
||||
let init_output = Command::new(&self.env.pageserver_bin())
|
||||
.args(args.iter().map(Cow::as_ref))
|
||||
.envs(self.pageserver_env_variables()?)
|
||||
.output()
|
||||
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
|
||||
|
||||
anyhow::ensure!(
|
||||
init_output.status.success(),
|
||||
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
|
||||
node_id,
|
||||
String::from_utf8_lossy(&init_output.stdout),
|
||||
String::from_utf8_lossy(&init_output.stderr),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
|
||||
let mut overrides = self.neon_local_overrides();
|
||||
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
|
||||
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
"Starting pageserver node {} at '{}' in {:?}",
|
||||
self.env.pageserver.id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
|
||||
let datadir_path_str = datadir.to_str().with_context(|| {
|
||||
format!(
|
||||
"Cannot start pageserver node {} in path that has no string representation: {:?}",
|
||||
self.env.pageserver.id, datadir,
|
||||
)
|
||||
})?;
|
||||
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
|
||||
if update_config {
|
||||
args.push("--update-config");
|
||||
args.push(Cow::Borrowed("--update-config"));
|
||||
}
|
||||
|
||||
for config_override in config_overrides {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
let envs = if self.env.pageserver.auth_type != AuthType::Trust {
|
||||
// Generate a token to connect from the pageserver to a safekeeper
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
datadir,
|
||||
&datadir,
|
||||
&self.env.pageserver_bin(),
|
||||
&args,
|
||||
envs,
|
||||
args.iter().map(Cow::as_ref),
|
||||
self.pageserver_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(&self.pid_file()),
|
||||
|| match self.check_status() {
|
||||
Ok(()) => Ok(true),
|
||||
@@ -270,6 +297,35 @@ impl PageServerNode {
|
||||
)
|
||||
}
|
||||
|
||||
fn pageserver_basic_args<'a>(
|
||||
&self,
|
||||
config_overrides: &'a [&'a str],
|
||||
datadir_path_str: &'a str,
|
||||
) -> Vec<Cow<'a, str>> {
|
||||
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
|
||||
|
||||
let mut overrides = self.neon_local_overrides();
|
||||
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
|
||||
for config_override in overrides {
|
||||
args.push(Cow::Borrowed("-c"));
|
||||
args.push(Cow::Owned(config_override));
|
||||
}
|
||||
|
||||
args
|
||||
}
|
||||
|
||||
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
|
||||
Ok(if self.env.pageserver.auth_type != AuthType::Trust {
|
||||
// Generate a token to connect from the pageserver to a safekeeper
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
|
||||
} else {
|
||||
Vec::new()
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
|
||||
@@ -131,13 +131,8 @@ impl SafekeeperNode {
|
||||
args.push("--no-sync");
|
||||
}
|
||||
|
||||
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
|
||||
if !comma_separated_endpoints.is_empty() {
|
||||
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
|
||||
}
|
||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||
args.extend(["--broker-etcd-prefix", prefix]);
|
||||
}
|
||||
let broker_endpoint = format!("{}", self.env.broker.client_url());
|
||||
args.extend(["--broker-endpoint", &broker_endpoint]);
|
||||
|
||||
let mut backup_threads = String::new();
|
||||
if let Some(threads) = self.conf.backup_threads {
|
||||
|
||||
@@ -1,29 +1,6 @@
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
etcd:
|
||||
restart: always
|
||||
image: quay.io/coreos/etcd:v3.5.4
|
||||
ports:
|
||||
- 2379:2379
|
||||
- 2380:2380
|
||||
environment:
|
||||
# This significantly speeds up etcd, and we don't need data persistence there anyway.
|
||||
ETCD_UNSAFE_NO_FSYNC: "1"
|
||||
command:
|
||||
- "etcd"
|
||||
- "--auto-compaction-mode=revision"
|
||||
- "--auto-compaction-retention=1"
|
||||
- "--name=etcd-cluster"
|
||||
- "--initial-cluster-state=new"
|
||||
- "--initial-cluster-token=etcd-cluster-1"
|
||||
- "--initial-cluster=etcd-cluster=http://etcd:2380"
|
||||
- "--initial-advertise-peer-urls=http://etcd:2380"
|
||||
- "--advertise-client-urls=http://etcd:2379"
|
||||
- "--listen-client-urls=http://0.0.0.0:2379"
|
||||
- "--listen-peer-urls=http://0.0.0.0:2380"
|
||||
- "--quota-backend-bytes=134217728" # 128 MB
|
||||
|
||||
minio:
|
||||
restart: always
|
||||
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
|
||||
@@ -56,7 +33,7 @@ services:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
environment:
|
||||
- BROKER_ENDPOINT='http://etcd:2379'
|
||||
- BROKER_ENDPOINT='http://storage_broker:50051'
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
@@ -68,7 +45,7 @@ services:
|
||||
- "-c"
|
||||
command:
|
||||
- "/usr/local/bin/pageserver -D /data/.neon/
|
||||
-c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
|
||||
-c \"broker_endpoint=$$BROKER_ENDPOINT\"
|
||||
-c \"listen_pg_addr='0.0.0.0:6400'\"
|
||||
-c \"listen_http_addr='0.0.0.0:9898'\"
|
||||
-c \"remote_storage={endpoint='http://minio:9000',
|
||||
@@ -76,7 +53,7 @@ services:
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/pageserver/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- storage_broker
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper1:
|
||||
@@ -85,7 +62,7 @@ services:
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
|
||||
- SAFEKEEPER_ID=1
|
||||
- BROKER_ENDPOINT=http://etcd:2379
|
||||
- BROKER_ENDPOINT=http://storage_broker:50051
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
@@ -99,14 +76,14 @@ services:
|
||||
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
|
||||
--listen-http='0.0.0.0:7676'
|
||||
--id=$$SAFEKEEPER_ID
|
||||
--broker-endpoints=$$BROKER_ENDPOINT
|
||||
--broker-endpoint=$$BROKER_ENDPOINT
|
||||
-D /data
|
||||
--remote-storage=\"{endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/safekeeper/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- storage_broker
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper2:
|
||||
@@ -115,7 +92,7 @@ services:
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
|
||||
- SAFEKEEPER_ID=2
|
||||
- BROKER_ENDPOINT=http://etcd:2379
|
||||
- BROKER_ENDPOINT=http://storage_broker:50051
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
@@ -129,14 +106,14 @@ services:
|
||||
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
|
||||
--listen-http='0.0.0.0:7676'
|
||||
--id=$$SAFEKEEPER_ID
|
||||
--broker-endpoints=$$BROKER_ENDPOINT
|
||||
--broker-endpoint=$$BROKER_ENDPOINT
|
||||
-D /data
|
||||
--remote-storage=\"{endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/safekeeper/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- storage_broker
|
||||
- minio_create_buckets
|
||||
|
||||
safekeeper3:
|
||||
@@ -145,7 +122,7 @@ services:
|
||||
environment:
|
||||
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
|
||||
- SAFEKEEPER_ID=3
|
||||
- BROKER_ENDPOINT=http://etcd:2379
|
||||
- BROKER_ENDPOINT=http://storage_broker:50051
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
@@ -159,16 +136,25 @@ services:
|
||||
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
|
||||
--listen-http='0.0.0.0:7676'
|
||||
--id=$$SAFEKEEPER_ID
|
||||
--broker-endpoints=$$BROKER_ENDPOINT
|
||||
--broker-endpoint=$$BROKER_ENDPOINT
|
||||
-D /data
|
||||
--remote-storage=\"{endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/safekeeper/'}\""
|
||||
depends_on:
|
||||
- etcd
|
||||
- storage_broker
|
||||
- minio_create_buckets
|
||||
|
||||
storage_broker:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
ports:
|
||||
- 50051:50051
|
||||
command:
|
||||
- "storage_broker"
|
||||
- "--listen-addr=0.0.0.0:50051"
|
||||
|
||||
compute:
|
||||
restart: always
|
||||
build:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
### Overview
|
||||
We use JWT tokens in communication between almost all components (compute, pageserver, safekeeper, CLI) regardless of the protocol used (HTTP/PostgreSQL).
|
||||
Etcd currently has no authentication.
|
||||
storage_broker currently has no authentication.
|
||||
Authentication is optional and is disabled by default for easier debugging.
|
||||
It is used in some tests, though.
|
||||
Note that we do not cover authentication with `pg.neon.tech` here.
|
||||
@@ -84,7 +84,7 @@ the scope is the tenant and the token is usually passed through the
|
||||
Pageserver keeps track of multiple tenants, each having multiple timelines.
|
||||
For each timeline, it connects to the corresponding Safekeeper.
|
||||
Information about "corresponding Safekeeper" is published by Safekeepers
|
||||
in the Etcd, but they do not publish access tokens, otherwise what is
|
||||
in the storage_broker, but they do not publish access tokens, otherwise what is
|
||||
the point of authentication.
|
||||
|
||||
Pageserver keeps a connection to some set of Safekeepers, which
|
||||
|
||||
@@ -23,9 +23,9 @@ We build all images after a successful `release` tests run and push automaticall
|
||||
|
||||
You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
|
||||
|
||||
- etcd x 1
|
||||
- pageserver x 1
|
||||
- safekeeper x 3
|
||||
- storage_broker x 1
|
||||
- compute x 1
|
||||
- MinIO x 1 # This is Amazon S3 compatible object storage
|
||||
|
||||
@@ -41,7 +41,7 @@ $ cd docker-compose/docker-compose.yml
|
||||
$ docker-compose down # remove the containers if they exist
|
||||
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
|
||||
Creating network "dockercompose_default" with the default driver
|
||||
Creating dockercompose_etcd3_1 ...
|
||||
Creating docker-compose_storage_broker_1 ... done
|
||||
(...omit...)
|
||||
```
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ the values in the config file, if any are specified for the same key and get int
|
||||
|
||||
```toml
|
||||
# Initial configuration file created by 'pageserver --init'
|
||||
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
|
||||
@@ -25,13 +24,12 @@ max_file_descriptors = '100'
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
initial_superuser_name = 'cloud_admin'
|
||||
|
||||
broker_etcd_prefix = 'neon'
|
||||
broker_endpoints = ['some://etcd']
|
||||
broker_endpoint = 'http://127.0.0.1:50051'
|
||||
|
||||
# [remote_storage]
|
||||
```
|
||||
|
||||
The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user,
|
||||
The config above shows default values for all basic pageserver settings, besides `broker_endpoint`: that one has to be set by the user,
|
||||
see the corresponding section below.
|
||||
Pageserver uses default values for all fields that are missing in the config, so it's not a hard error to leave the config blank.
|
||||
Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.
|
||||
@@ -50,16 +48,10 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage=
|
||||
|
||||
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
|
||||
|
||||
#### broker_endpoints
|
||||
#### broker_endpoint
|
||||
|
||||
A list of endpoints (etcd currently) to connect and pull the information from.
|
||||
Mandatory, does not have a default, since requires etcd to be started as a separate process,
|
||||
and its connection url should be specified separately.
|
||||
|
||||
#### broker_etcd_prefix
|
||||
|
||||
A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster.
|
||||
Default is `neon`.
|
||||
The storage broker endpoint to connect to and pull information from. Default is
|
||||
`'http://127.0.0.1:50051'`.
|
||||
|
||||
#### checkpoint_distance
|
||||
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
[package]
|
||||
name = "etcd_broker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
etcd-client = "0.9.0"
|
||||
regex = "1.4.5"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
once_cell = "1.13.0"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio = "1"
|
||||
tracing = "0.1"
|
||||
thiserror = "1"
|
||||
@@ -1,209 +0,0 @@
|
||||
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
|
||||
//! Intended to connect services to each other, not to store their data.
|
||||
|
||||
/// All broker keys, that are used when dealing with etcd.
|
||||
pub mod subscription_key;
|
||||
/// All broker values, possible to use when dealing with etcd.
|
||||
pub mod subscription_value;
|
||||
|
||||
use std::str::FromStr;
|
||||
|
||||
use serde::de::DeserializeOwned;
|
||||
|
||||
use subscription_key::SubscriptionKey;
|
||||
use tokio::{sync::mpsc, task::JoinHandle};
|
||||
use tracing::*;
|
||||
|
||||
use crate::subscription_key::SubscriptionFullKey;
|
||||
|
||||
pub use etcd_client::*;
|
||||
|
||||
/// Default value to use for prefixing to all etcd keys with.
|
||||
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
||||
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
||||
|
||||
/// A way to control the data retrieval from a certain subscription.
///
/// Bundles the receiving end of the updates channel with the handles needed to
/// stop the background watcher task that feeds it.
|
||||
pub struct BrokerSubscription<V> {
|
||||
/// An unbounded channel to fetch the relevant etcd updates from.
|
||||
pub value_updates: mpsc::UnboundedReceiver<BrokerUpdate<V>>,
|
||||
/// The key this subscription was created for; reported in error messages on cancellation.
key: SubscriptionKey,
|
||||
/// A subscription task handle, to allow waiting on it for the task to complete.
|
||||
/// Both the updates channel and the handle require `&mut`, so it's better to keep
|
||||
/// both `pub` to allow using both in the same structures without borrow checker complaining.
|
||||
pub watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||
/// The etcd watcher returned when the watch was created; used to cancel the watch.
watcher: Watcher,
|
||||
}
|
||||
|
||||
impl<V> BrokerSubscription<V> {
|
||||
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
||||
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
||||
self.watcher.cancel().await.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!("Failed to cancel broker subscription, kind: {:?}", self.key),
|
||||
)
|
||||
})?;
|
||||
match (&mut self.watcher_handle).await {
|
||||
Ok(res) => res,
|
||||
Err(e) => {
|
||||
if e.is_cancelled() {
|
||||
// don't error on the tasks that are cancelled already
|
||||
Ok(())
|
||||
} else {
|
||||
Err(BrokerError::InternalError(format!(
|
||||
"Panicked during broker subscription task, kind: {:?}, error: {e}",
|
||||
self.key
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Aborts the background watcher task when the subscription goes away.
impl<V> Drop for BrokerSubscription<V> {
|
||||
fn drop(&mut self) {
|
||||
// we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped,
|
||||
// no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task.
|
||||
self.watcher_handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
/// An update from the etcd broker.
///
/// `V` is the type produced by the value parser the subscription was created with.
|
||||
pub struct BrokerUpdate<V> {
|
||||
/// Etcd version of the key-value pair; the larger it is, the more recent the data.
|
||||
pub etcd_version: i64,
|
||||
/// Etcd key for the corresponding value, parsed from the broker KV.
|
||||
pub key: SubscriptionFullKey,
|
||||
/// Current etcd value, parsed from the broker KV.
|
||||
pub value: V,
|
||||
}
|
||||
|
||||
/// Errors produced while subscribing to, reading from or cancelling a broker subscription.
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BrokerError {
|
||||
/// An error returned by the etcd client, paired with a description of the operation that failed.
#[error("Etcd client error: {0}. Context: {1}")]
|
||||
EtcdClient(etcd_client::Error, String),
|
||||
/// The etcd key could not be interpreted as a subscription key; carries the reason.
#[error("Error during parsing etcd key: {0}")]
|
||||
KeyNotParsed(String),
|
||||
/// A failure inside the subscription machinery itself (e.g. the stream or the watcher task failed).
#[error("Internal error: {0}")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
||||
/// Stops and returns `Err` on any error during etcd communication.
|
||||
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
||||
/// exiting normally in such cases.
|
||||
/// Etcd values are parsed as json fukes into a type, specified in the generic patameter.
|
||||
pub async fn subscribe_for_json_values<V>(
|
||||
client: &mut Client,
|
||||
key: SubscriptionKey,
|
||||
) -> Result<BrokerSubscription<V>, BrokerError>
|
||||
where
|
||||
V: DeserializeOwned + Send + 'static,
|
||||
{
|
||||
subscribe_for_values(client, key, |_, value_str| {
|
||||
match serde_json::from_str::<V>(value_str) {
|
||||
Ok(value) => Some(value),
|
||||
Err(e) => {
|
||||
error!("Failed to parse value str '{value_str}': {e}");
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Same as [`subscribe_for_json_values`], but allows to specify a custom parser of a etcd value string.
|
||||
pub async fn subscribe_for_values<P, V>(
|
||||
client: &mut Client,
|
||||
key: SubscriptionKey,
|
||||
value_parser: P,
|
||||
) -> Result<BrokerSubscription<V>, BrokerError>
|
||||
where
|
||||
V: Send + 'static,
|
||||
P: Fn(SubscriptionFullKey, &str) -> Option<V> + Send + 'static,
|
||||
{
|
||||
info!("Subscribing to broker value updates, key: {key:?}");
|
||||
let subscription_key = key.clone();
|
||||
|
||||
let (watcher, mut stream) = client
|
||||
.watch(key.watch_key(), Some(WatchOptions::new().with_prefix()))
|
||||
.await
|
||||
.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!("Failed to init the watch for subscription {key:?}"),
|
||||
)
|
||||
})?;
|
||||
|
||||
let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel();
|
||||
let watcher_handle = tokio::spawn(async move {
|
||||
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
||||
"Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind
|
||||
)))? {
|
||||
if resp.canceled() {
|
||||
info!("Watch for timeline updates subscription was canceled, exiting");
|
||||
break;
|
||||
}
|
||||
|
||||
let events = resp.events();
|
||||
debug!("Processing {} events", events.len());
|
||||
|
||||
for event in events {
|
||||
if EventType::Put == event.event_type() {
|
||||
if let Some(new_etcd_kv) = event.kv() {
|
||||
match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) {
|
||||
Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate {
|
||||
etcd_version: new_etcd_kv.version(),
|
||||
key,
|
||||
value,
|
||||
}) {
|
||||
info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}");
|
||||
break;
|
||||
},
|
||||
Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"),
|
||||
Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"),
|
||||
Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}.instrument(info_span!("etcd_broker")));
|
||||
|
||||
Ok(BrokerSubscription {
|
||||
key: subscription_key,
|
||||
value_updates: value_updates_receiver,
|
||||
watcher_handle,
|
||||
watcher,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_etcd_kv<P, V>(
|
||||
kv: &KeyValue,
|
||||
value_parser: &P,
|
||||
cluster_prefix: &str,
|
||||
) -> Result<Option<(SubscriptionFullKey, V)>, BrokerError>
|
||||
where
|
||||
P: Fn(SubscriptionFullKey, &str) -> Option<V>,
|
||||
{
|
||||
let key_str = kv.key_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string())
|
||||
})?;
|
||||
let value_str = kv.value_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string())
|
||||
})?;
|
||||
|
||||
if !key_str.starts_with(cluster_prefix) {
|
||||
return Err(BrokerError::KeyNotParsed(format!(
|
||||
"KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}"
|
||||
)));
|
||||
}
|
||||
|
||||
let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| {
|
||||
BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}"))
|
||||
})?;
|
||||
|
||||
Ok(value_parser(key, value_str).map(|value| (key, value)))
|
||||
}
|
||||
@@ -1,310 +0,0 @@
|
||||
//! Etcd broker keys, used in the project and shared between instances.
|
||||
//! The keys are split into two categories:
|
||||
//!
|
||||
//! * [`SubscriptionFullKey`] full key format: `<cluster_prefix>/<tenant>/<timeline>/<node_kind>/<operation>/<node_id>`
|
||||
//! Always returned from etcd in this form, always start with the user key provided.
|
||||
//!
|
||||
//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available.
|
||||
//! Full key always starts with the user input one, due to etcd subscription properties.
|
||||
|
||||
use std::{fmt::Display, str::FromStr};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::{Captures, Regex};
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId};
|
||||
|
||||
/// User-provided (partial) subscription key: a cluster prefix plus the kind of updates to watch.
/// [`SubscriptionKey::watch_key`] renders it into the etcd key prefix used for the watch.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct SubscriptionKey {
|
||||
/// Generic cluster prefix, allowing multiple logical groups to share the same etcd instance.
|
||||
pub cluster_prefix: String,
|
||||
/// The subscription kind, determining how specific the watched key prefix is.
|
||||
pub kind: SubscriptionKind,
|
||||
}
|
||||
|
||||
/// All currently possible key kinds of an etcd broker subscription.
|
||||
/// Etcd treats every key that starts with the given subscription key as matching and
|
||||
/// returns it as part of the subscription.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum SubscriptionKind {
|
||||
/// Get every update in etcd.
|
||||
All,
|
||||
/// Get etcd updates for any timeline of a certain tenant, affected by any operation from any node kind.
|
||||
TenantTimelines(TenantId),
|
||||
/// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind.
|
||||
Timeline(TenantTimelineId),
|
||||
/// Get etcd timeline updates, specific to a certain node kind.
|
||||
Node(TenantTimelineId, NodeKind),
|
||||
/// Get etcd timeline updates for a certain operation on specific nodes.
|
||||
Operation(TenantTimelineId, NodeKind, OperationKind),
|
||||
}
|
||||
|
||||
/// All kinds of nodes, able to write into etcd.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum NodeKind {
|
||||
/// Rendered as `safekeeper` in etcd keys.
Safekeeper,
|
||||
/// Rendered as `pageserver` in etcd keys.
Pageserver,
|
||||
}
|
||||
|
||||
/// The operation segment of an etcd broker key.
/// Only safekeeper operations are defined at the moment.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum OperationKind {
|
||||
/// An operation running on a safekeeper node.
Safekeeper(SkOperationKind),
|
||||
}
|
||||
|
||||
/// Current operations, running inside the safekeeper node.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum SkOperationKind {
|
||||
/// Rendered as `timeline_info` in etcd keys.
TimelineInfo,
|
||||
/// Rendered as `wal_backup` in etcd keys.
WalBackup,
|
||||
}
|
||||
|
||||
/// Matches the tail of a full etcd key: `/<tenant>/<timeline>/<node_kind>/<operation>/<node_id>`.
/// Capture groups 1 and 2 are runs of hex digits, groups 3 and 4 are runs of any non-`/` characters,
/// group 5 is a run of decimal digits. The pattern is anchored at the end of the string only,
/// so any text before the matched tail is ignored.
static SUBSCRIPTION_FULL_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$")
|
||||
// The pattern is a constant, failing to compile it is a programming error.
.expect("wrong subscription full etcd key regex")
|
||||
});
|
||||
|
||||
/// Full key, received from etcd during any of the component's work.
|
||||
/// No other etcd keys are considered during system's work.
|
||||
/// Displayed as `<id>/<node_kind>/<operation>/<node_id>` (without the cluster prefix).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct SubscriptionFullKey {
|
||||
/// Tenant and timeline the key belongs to.
pub id: TenantTimelineId,
|
||||
/// Kind of the node that the key refers to.
pub node_kind: NodeKind,
|
||||
/// Operation that the key refers to.
pub operation: OperationKind,
|
||||
/// Id of the node, the last segment of the key.
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
impl SubscriptionKey {
|
||||
/// Subscribes for all etcd updates.
|
||||
pub fn all(cluster_prefix: String) -> Self {
|
||||
SubscriptionKey {
|
||||
cluster_prefix,
|
||||
kind: SubscriptionKind::All,
|
||||
}
|
||||
}
|
||||
|
||||
/// Subscribes to a given timeline info updates from safekeepers.
|
||||
pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self {
|
||||
Self {
|
||||
cluster_prefix,
|
||||
kind: SubscriptionKind::Operation(
|
||||
timeline,
|
||||
NodeKind::Safekeeper,
|
||||
OperationKind::Safekeeper(SkOperationKind::TimelineInfo),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Subscribes to all timeine updates during specific operations, running on the corresponding nodes.
|
||||
pub fn operation(
|
||||
cluster_prefix: String,
|
||||
timeline: TenantTimelineId,
|
||||
node_kind: NodeKind,
|
||||
operation: OperationKind,
|
||||
) -> Self {
|
||||
Self {
|
||||
cluster_prefix,
|
||||
kind: SubscriptionKind::Operation(timeline, node_kind, operation),
|
||||
}
|
||||
}
|
||||
|
||||
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
||||
pub fn watch_key(&self) -> String {
|
||||
let cluster_prefix = &self.cluster_prefix;
|
||||
match self.kind {
|
||||
SubscriptionKind::All => cluster_prefix.to_string(),
|
||||
SubscriptionKind::TenantTimelines(tenant_id) => {
|
||||
format!("{cluster_prefix}/{tenant_id}")
|
||||
}
|
||||
SubscriptionKind::Timeline(id) => {
|
||||
format!("{cluster_prefix}/{id}")
|
||||
}
|
||||
SubscriptionKind::Node(id, node_kind) => {
|
||||
format!("{cluster_prefix}/{id}/{node_kind}")
|
||||
}
|
||||
SubscriptionKind::Operation(id, node_kind, operation_kind) => {
|
||||
format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for OperationKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
OperationKind::Safekeeper(o) => o.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for OperationKind {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(operation_kind_str: &str) -> Result<Self, Self::Err> {
|
||||
match operation_kind_str {
|
||||
"timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)),
|
||||
"wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)),
|
||||
_ => Err(format!("Unknown operation kind: {operation_kind_str}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for SubscriptionFullKey {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
id,
|
||||
node_kind,
|
||||
operation,
|
||||
node_id,
|
||||
} = self;
|
||||
write!(f, "{id}/{node_kind}/{operation}/{node_id}")
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for SubscriptionFullKey {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(subscription_kind_str: &str) -> Result<Self, Self::Err> {
|
||||
let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) {
|
||||
Some(captures) => captures,
|
||||
None => {
|
||||
return Err(format!(
|
||||
"Subscription kind str does not match a subscription full key regex {}",
|
||||
SUBSCRIPTION_FULL_KEY_REGEX.as_str()
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
id: TenantTimelineId::new(
|
||||
parse_capture(&key_captures, 1)?,
|
||||
parse_capture(&key_captures, 2)?,
|
||||
),
|
||||
node_kind: parse_capture(&key_captures, 3)?,
|
||||
operation: parse_capture(&key_captures, 4)?,
|
||||
node_id: NodeId(parse_capture(&key_captures, 5)?),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: Display,
|
||||
{
|
||||
let capture_match = caps
|
||||
.get(index)
|
||||
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
||||
.as_str();
|
||||
capture_match.parse().map_err(|e| {
|
||||
format!(
|
||||
"Failed to parse {} from {capture_match}: {e}",
|
||||
std::any::type_name::<T>()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
impl Display for NodeKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Safekeeper => write!(f, "safekeeper"),
|
||||
Self::Pageserver => write!(f, "pageserver"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for NodeKind {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(node_kind_str: &str) -> Result<Self, Self::Err> {
|
||||
match node_kind_str {
|
||||
"safekeeper" => Ok(Self::Safekeeper),
|
||||
"pageserver" => Ok(Self::Pageserver),
|
||||
_ => Err(format!("Invalid node kind: {node_kind_str}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for SkOperationKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::TimelineInfo => write!(f, "timeline_info"),
|
||||
Self::WalBackup => write!(f, "wal_backup"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for SkOperationKind {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(operation_str: &str) -> Result<Self, Self::Err> {
|
||||
match operation_str {
|
||||
"timeline_info" => Ok(Self::TimelineInfo),
|
||||
"wal_backup" => Ok(Self::WalBackup),
|
||||
_ => Err(format!("Invalid operation: {operation_str}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use utils::id::TimelineId;

    use super::*;

    /// A full key built from the most specific subscription key must start with the watch key of
    /// every broader subscription and must parse back into the expected [`SubscriptionFullKey`].
    #[test]
    fn full_cluster_key_parsing() {
        let prefix = "neon";
        let node_kind = NodeKind::Safekeeper;
        let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup);
        let tenant_id = TenantId::generate();
        let timeline_id = TimelineId::generate();
        let id = TenantTimelineId::new(tenant_id, timeline_id);
        let node_id = NodeId(1);

        // Ordered from the broadest subscription to the most specific one.
        let timeline_subscription_keys = [
            SubscriptionKind::All,
            SubscriptionKind::TenantTimelines(tenant_id),
            SubscriptionKind::Timeline(id),
            SubscriptionKind::Node(id, node_kind),
            SubscriptionKind::Operation(id, node_kind, operation_kind),
        ]
        .map(|kind| SubscriptionKey {
            cluster_prefix: prefix.to_string(),
            kind,
        });

        let most_specific_watch_key = timeline_subscription_keys.last().unwrap().watch_key();
        let full_key_string = format!("{}/{node_id}", most_specific_watch_key);

        for key in timeline_subscription_keys {
            assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match");
        }

        let full_key = match SubscriptionFullKey::from_str(&full_key_string) {
            Ok(parsed_key) => parsed_key,
            Err(e) => panic!("Failed to parse {full_key_string} as a subscription full key: {e}"),
        };

        let expected_key = SubscriptionFullKey {
            id,
            node_kind,
            operation: operation_kind,
            node_id,
        };
        assert_eq!(full_key, expected_key)
    }
}
|
||||
@@ -1,38 +0,0 @@
|
||||
//! Module for the values to put into etcd.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Data about safekeeper's timeline. Fields made optional for easy migrations.
|
||||
/// LSN fields are (de)serialized through their string form (`DisplayFromStr`) and
/// fall back to `None` when missing from the input.
#[serde_as]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub flush_lsn: Option<Lsn>,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
/// LSN up to which safekeeper has backed WAL.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub backup_lsn: Option<Lsn>,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
// NOTE(review): undocumented in the original; the meaning of this LSN is not visible here — confirm with the safekeeper code.
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub peer_horizon_lsn: Option<Lsn>,
|
||||
// NOTE(review): undocumented in the original; the meaning of this LSN is not visible here — confirm with the safekeeper code.
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
/// A connection string to use for WAL receiving.
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstr: Option<String>,
|
||||
}
|
||||
@@ -104,11 +104,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
/// Downcast to LocalFs implementation. For tests.
|
||||
fn as_local(&self) -> Option<&LocalFs> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Download {
|
||||
@@ -142,7 +137,7 @@ impl std::fmt::Display for DownloadError {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -277,7 +272,7 @@ impl Debug for S3Config {
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
let bucket_region = toml.get("bucket_region");
|
||||
@@ -301,7 +296,8 @@ impl RemoteStorageConfig {
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
|
||||
let storage = match (local_path, bucket_name, bucket_region) {
|
||||
(None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
|
||||
// no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
|
||||
(None, None, None) => return Ok(None),
|
||||
(_, Some(_), None) => {
|
||||
bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
|
||||
}
|
||||
@@ -327,11 +323,11 @@ impl RemoteStorageConfig {
|
||||
(Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
|
||||
};
|
||||
|
||||
Ok(RemoteStorageConfig {
|
||||
Ok(Some(RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -283,10 +283,6 @@ impl RemoteStorage for LocalFs {
|
||||
bail!("File {file_path:?} either does not exist or is not a file")
|
||||
}
|
||||
}
|
||||
|
||||
fn as_local(&self) -> Option<&LocalFs> {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Path) -> PathBuf {
|
||||
|
||||
@@ -22,3 +22,40 @@ pub struct TimelineCreateRequest {
|
||||
// If not passed, it is assigned to the beginning of commit_lsn segment.
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// Serde default for the LSN fields of [`SkTimelineInfo`]: returns `Lsn::INVALID`.
fn lsn_invalid() -> Lsn {
|
||||
Lsn::INVALID
|
||||
}
|
||||
|
||||
/// Data about safekeeper's timeline, mirrors broker.proto.
|
||||
/// LSN fields are (de)serialized through their string form (`DisplayFromStr`) and
/// default to `Lsn::INVALID` (via `lsn_invalid`) when missing from the input.
#[serde_as]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN up to which safekeeper has backed WAL.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub backup_lsn: Lsn,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
// NOTE(review): undocumented in the original; the meaning of this LSN is not visible here — confirm against broker.proto.
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
// NOTE(review): undocumented in the original; the meaning of this LSN is not visible here — confirm against broker.proto.
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub local_start_lsn: Lsn,
|
||||
/// A connection string to use for WAL receiving.
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstr: Option<String>,
|
||||
}
|
||||
|
||||
@@ -10,11 +10,13 @@ pub fn init_sentry(
|
||||
extra_options: &[(&str, &str)],
|
||||
) -> Option<ClientInitGuard> {
|
||||
let dsn = env::var("SENTRY_DSN").ok()?;
|
||||
let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into());
|
||||
|
||||
let guard = sentry::init((
|
||||
dsn,
|
||||
sentry::ClientOptions {
|
||||
release: release_name,
|
||||
environment: Some(environment.into()),
|
||||
..Default::default()
|
||||
},
|
||||
));
|
||||
|
||||
@@ -59,13 +59,13 @@ tracing = "0.1.36"
|
||||
url = "2"
|
||||
walkdir = "2.3.2"
|
||||
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_connection = { path = "../libs/postgres_connection" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
pq_proto = { path = "../libs/pq_proto" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||
tenant_size_model = { path = "../libs/tenant_size_model" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use anyhow::Result;
|
||||
use pageserver::repository::{Key, Value};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::filename::{DeltaFileName, ImageFileName};
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::Layer;
|
||||
use pageserver::tenant::storage_layer::ValueReconstructResult;
|
||||
use pageserver::tenant::storage_layer::ValueReconstructState;
|
||||
use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use std::cmp::{max, min};
|
||||
use std::fs::File;
|
||||
@@ -14,7 +13,7 @@ use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
@@ -25,14 +24,6 @@ struct DummyDelta {
|
||||
}
|
||||
|
||||
impl Layer for DummyDelta {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
TenantId::from_str("00000000000000000000000000000000").unwrap()
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
TimelineId::from_str("00000000000000000000000000000000").unwrap()
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
@@ -40,15 +31,6 @@ impl Layer for DummyDelta {
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
@@ -62,24 +44,12 @@ impl Layer for DummyDelta {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_> {
|
||||
panic!()
|
||||
}
|
||||
|
||||
fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
panic!()
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||
todo!()
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,14 +59,6 @@ struct DummyImage {
|
||||
}
|
||||
|
||||
impl Layer for DummyImage {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
TenantId::from_str("00000000000000000000000000000000").unwrap()
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
TimelineId::from_str("00000000000000000000000000000000").unwrap()
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
@@ -106,14 +68,6 @@ impl Layer for DummyImage {
|
||||
self.lsn..(self.lsn + 1)
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
@@ -127,29 +81,17 @@ impl Layer for DummyImage {
|
||||
false
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_> {
|
||||
panic!()
|
||||
}
|
||||
|
||||
fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
panic!()
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||
todo!()
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<dyn Layer> {
|
||||
let mut layer_map = LayerMap::<dyn Layer>::default();
|
||||
|
||||
let mut min_lsn = Lsn(u64::MAX);
|
||||
let mut max_lsn = Lsn(0);
|
||||
@@ -185,7 +127,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
||||
}
|
||||
|
||||
/// Construct a layer map query pattern for benchmarks
|
||||
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
|
||||
fn uniform_query_pattern(layer_map: &LayerMap<dyn Layer>) -> Vec<(Key, Lsn)> {
|
||||
// For each image layer we query one of the pages contained, at LSN right
|
||||
// before the image layer was created. This gives us a somewhat uniform
|
||||
// coverage of both the lsn and key space because image layers have
|
||||
@@ -258,7 +200,7 @@ fn bench_from_real_project(c: &mut Criterion) {
|
||||
|
||||
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
|
||||
fn bench_sequential(c: &mut Criterion) {
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut layer_map: LayerMap<dyn Layer> = LayerMap::default();
|
||||
|
||||
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
|
||||
//
|
||||
|
||||
@@ -247,7 +247,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
// start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?;
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
|
||||
@@ -18,7 +19,7 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use url::Url;
|
||||
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
logging::LogFormat,
|
||||
@@ -39,6 +40,7 @@ pub mod defaults {
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
@@ -59,7 +61,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
|
||||
r###"
|
||||
# Initial configuration file created by 'pageserver --init'
|
||||
|
||||
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
|
||||
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
|
||||
|
||||
@@ -71,6 +72,8 @@ pub mod defaults {
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}'
|
||||
|
||||
#log_format = '{DEFAULT_LOG_FORMAT}'
|
||||
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
@@ -132,12 +135,8 @@ pub struct PageServerConf {
|
||||
pub profiling: ProfilingConfig,
|
||||
pub default_tenant_conf: TenantConf,
|
||||
|
||||
/// A prefix to add in etcd brokers before every key.
|
||||
/// Can be used for isolating different pageserver groups within the same etcd cluster.
|
||||
pub broker_etcd_prefix: String,
|
||||
|
||||
/// Etcd broker endpoints to connect to.
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
/// Storage broker endpoint to connect to.
|
||||
pub broker_endpoint: Uri,
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
@@ -148,8 +147,7 @@ pub struct PageServerConf {
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
/// and/or serialized at a whim, while the token is secret. Currently this token is the
|
||||
/// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
|
||||
/// the future, more tokens and auth may arrive for etcd and/or its rewrite (see
|
||||
/// https://github.com/neondatabase/neon/issues/2394), completely changing the logic.
|
||||
/// the future, more tokens and auth may arrive for storage broker, completely changing the logic.
|
||||
/// Hence, we resort to a global variable for now instead of passing the token from the
|
||||
/// startup code to the connection code through a dozen layers.
|
||||
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
|
||||
@@ -216,8 +214,7 @@ struct PageServerConfigBuilder {
|
||||
id: BuilderValue<NodeId>,
|
||||
|
||||
profiling: BuilderValue<ProfilingConfig>,
|
||||
broker_etcd_prefix: BuilderValue<String>,
|
||||
broker_endpoints: BuilderValue<Vec<Url>>,
|
||||
broker_endpoint: BuilderValue<Uri>,
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
@@ -247,8 +244,9 @@ impl Default for PageServerConfigBuilder {
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
profiling: Set(ProfilingConfig::Disabled),
|
||||
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
|
||||
broker_endpoints: Set(Vec::new()),
|
||||
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint")),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
@@ -308,12 +306,8 @@ impl PageServerConfigBuilder {
|
||||
self.remote_storage_config = BuilderValue::Set(remote_storage_config)
|
||||
}
|
||||
|
||||
pub fn broker_endpoints(&mut self, broker_endpoints: Vec<Url>) {
|
||||
self.broker_endpoints = BuilderValue::Set(broker_endpoints)
|
||||
}
|
||||
|
||||
pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) {
|
||||
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
|
||||
pub fn broker_endpoint(&mut self, broker_endpoint: Uri) {
|
||||
self.broker_endpoint = BuilderValue::Set(broker_endpoint)
|
||||
}
|
||||
|
||||
pub fn id(&mut self, node_id: NodeId) {
|
||||
@@ -368,12 +362,9 @@ impl PageServerConfigBuilder {
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: self
|
||||
.broker_endpoints
|
||||
broker_endpoint: self
|
||||
.broker_endpoint
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
||||
broker_etcd_prefix: self
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_size_logical_size_queries: self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
@@ -533,24 +524,14 @@ impl PageServerConf {
|
||||
)),
|
||||
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
||||
"remote_storage" => {
|
||||
builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?))
|
||||
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
|
||||
}
|
||||
"tenant_config" => {
|
||||
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||
}
|
||||
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
||||
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
||||
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
|
||||
"broker_endpoints" => builder.broker_endpoints(
|
||||
parse_toml_array(key, item)?
|
||||
.into_iter()
|
||||
.map(|endpoint_str| {
|
||||
endpoint_str.parse::<Url>().with_context(|| {
|
||||
format!("Array item {endpoint_str} for key {key} is not a valid url endpoint")
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<_>>()?,
|
||||
),
|
||||
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
||||
"log_format" => builder.log_format(
|
||||
LogFormat::from_config(&parse_toml_string(key, item)?)?
|
||||
),
|
||||
@@ -677,8 +658,7 @@ impl PageServerConf {
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::dummy_conf(),
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
}
|
||||
@@ -730,22 +710,6 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
|
||||
let array = item
|
||||
.as_array()
|
||||
.with_context(|| format!("configure option {name} is not an array"))?;
|
||||
|
||||
array
|
||||
.iter()
|
||||
.map(|value| {
|
||||
value
|
||||
.as_str()
|
||||
.map(str::to_string)
|
||||
.with_context(|| format!("Array item {value:?} for key {name} is not a string"))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Configurable semaphore permits setting.
|
||||
///
|
||||
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
|
||||
@@ -835,10 +799,10 @@ log_format = 'json'
|
||||
fn parse_defaults() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
|
||||
// we have to create dummy values to overcome the validation errors
|
||||
let config_string = format!(
|
||||
"pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
"pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
@@ -864,10 +828,7 @@ log_format = 'json'
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
},
|
||||
@@ -881,10 +842,10 @@ log_format = 'json'
|
||||
fn parse_basic_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
|
||||
|
||||
let config_string = format!(
|
||||
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'",
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
@@ -910,10 +871,7 @@ log_format = 'json'
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
},
|
||||
@@ -947,7 +905,7 @@ local_path = '{}'"#,
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
pg_distrib_dir.display(),
|
||||
@@ -1014,7 +972,7 @@ concurrency_limit = {s3_concurrency_limit}"#
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
pg_distrib_dir.display(),
|
||||
@@ -1059,7 +1017,7 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
[tenant_config]
|
||||
trace_read_requests = {trace_read_requests}"#,
|
||||
|
||||
@@ -197,12 +197,11 @@ pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
use std::ops::DerefMut;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::ensure;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{info, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
@@ -215,6 +214,7 @@ use crate::metrics::MeasureRemoteOp;
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
use crate::metrics::RemoteOpKind;
|
||||
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
|
||||
use crate::tenant::filename::LayerFileName;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
storage_sync::index::LayerFileMetadata,
|
||||
@@ -287,7 +287,7 @@ struct UploadQueueInitialized {
|
||||
|
||||
/// All layer files stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations
|
||||
latest_files: HashMap<RemotePath, LayerFileMetadata>,
|
||||
latest_files: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
|
||||
/// Metadata stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations.
|
||||
@@ -357,10 +357,6 @@ impl UploadQueue {
|
||||
|
||||
fn initialize_with_current_remote_index_part(
|
||||
&mut self,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
index_part: &IndexPart,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
@@ -371,18 +367,13 @@ impl UploadQueue {
|
||||
}
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
for timeline_name in &index_part.timeline_layers {
|
||||
let local_path = timeline_path.join(timeline_name);
|
||||
let remote_timeline_path = conf.remote_path(&local_path).expect(
|
||||
"Remote timeline path and local timeline path were constructed form the same conf",
|
||||
);
|
||||
for layer_name in &index_part.timeline_layers {
|
||||
let layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(timeline_name)
|
||||
.get(layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
files.insert(remote_timeline_path, layer_metadata);
|
||||
files.insert(layer_name.to_owned(), layer_metadata);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
@@ -431,13 +422,13 @@ struct UploadTask {
|
||||
#[derive(Debug)]
|
||||
enum UploadOp {
|
||||
/// Upload a layer file
|
||||
UploadLayer(PathBuf, LayerFileMetadata),
|
||||
UploadLayer(LayerFileName, LayerFileMetadata),
|
||||
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
|
||||
/// Delete a file.
|
||||
Delete(RemoteOpFileKind, PathBuf),
|
||||
Delete(RemoteOpFileKind, LayerFileName),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
Barrier(tokio::sync::watch::Sender<()>),
|
||||
@@ -446,14 +437,16 @@ enum UploadOp {
|
||||
impl std::fmt::Display for UploadOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
UploadOp::UploadLayer(path, metadata) => write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?})",
|
||||
path.display(),
|
||||
metadata.file_size()
|
||||
),
|
||||
UploadOp::UploadLayer(path, metadata) => {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?})",
|
||||
path.file_name(),
|
||||
metadata.file_size()
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
|
||||
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.display()),
|
||||
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
|
||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||
}
|
||||
}
|
||||
@@ -465,12 +458,7 @@ impl RemoteTimelineClient {
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
index_part,
|
||||
)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -524,13 +512,15 @@ impl RemoteTimelineClient {
|
||||
/// On success, returns the size of the downloaded file.
|
||||
pub async fn download_layer_file(
|
||||
&self,
|
||||
remote_path: &RemotePath,
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = download::download_layer_file(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
remote_path,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -548,13 +538,13 @@ impl RemoteTimelineClient {
|
||||
let new_metadata = LayerFileMetadata::new(downloaded_size);
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(remote_path) {
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
|
||||
upgraded.merge(&new_metadata);
|
||||
} else {
|
||||
// The file should exist, since we just downloaded it.
|
||||
warn!(
|
||||
"downloaded file {:?} not found in local copy of the index file",
|
||||
remote_path
|
||||
layer_file_name
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -611,7 +601,7 @@ impl RemoteTimelineClient {
|
||||
///
|
||||
pub fn schedule_layer_file_upload(
|
||||
self: &Arc<Self>,
|
||||
path: &Path,
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
@@ -626,13 +616,16 @@ impl RemoteTimelineClient {
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(self.conf.remote_path(path)?, layer_metadata.clone());
|
||||
.insert(layer_file_name.clone(), layer_metadata.clone());
|
||||
|
||||
let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone());
|
||||
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
info!("scheduled layer file upload {}", path.display());
|
||||
info!(
|
||||
"scheduled layer file upload {}",
|
||||
layer_file_name.file_name()
|
||||
);
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
@@ -644,16 +637,13 @@ impl RemoteTimelineClient {
|
||||
///
|
||||
/// The deletion won't actually be performed, until all preceding
|
||||
/// upload operations have completed successfully.
|
||||
pub fn schedule_layer_file_deletion(self: &Arc<Self>, paths: &[PathBuf]) -> anyhow::Result<()> {
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// Convert the paths into RemotePaths, and gather other information we need.
|
||||
let mut remote_paths = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
remote_paths.push(self.conf.remote_path(path)?);
|
||||
}
|
||||
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need to update it. Just serialize it.
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
@@ -667,8 +657,8 @@ impl RemoteTimelineClient {
|
||||
// from latest_files, but not yet scheduled for deletion. Use a closure
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
for remote_path in remote_paths {
|
||||
upload_queue.latest_files.remove(&remote_path);
|
||||
for name in names {
|
||||
upload_queue.latest_files.remove(name);
|
||||
}
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
@@ -681,11 +671,11 @@ impl RemoteTimelineClient {
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
// schedule the actual deletions
|
||||
for path in paths {
|
||||
let op = UploadOp::Delete(RemoteOpFileKind::Layer, PathBuf::from(path));
|
||||
for name in names {
|
||||
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {}", path.display());
|
||||
info!("scheduled layer file deletion {}", name.file_name());
|
||||
}
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
@@ -841,7 +831,11 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref path, ref layer_metadata) => {
|
||||
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
||||
.join(layer_file_name.file_name());
|
||||
upload::upload_timeline_layer(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
@@ -872,7 +866,11 @@ impl RemoteTimelineClient {
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::Delete(metric_file_kind, ref path) => {
|
||||
UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
||||
.join(layer_file_name.file_name());
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
@@ -1078,7 +1076,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use std::collections::HashSet;
|
||||
use std::{collections::HashSet, path::Path};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
@@ -1102,8 +1100,8 @@ mod tests {
|
||||
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
|
||||
}
|
||||
|
||||
fn assert_file_list(a: &HashSet<String>, b: &[&str]) {
|
||||
let mut avec: Vec<&str> = a.iter().map(|a| a.as_str()).collect();
|
||||
fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
|
||||
let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
|
||||
avec.sort();
|
||||
|
||||
let mut bvec = b.to_vec();
|
||||
@@ -1198,11 +1196,11 @@ mod tests {
|
||||
std::fs::write(timeline_path.join("bar"), &content_bar)?;
|
||||
|
||||
client.schedule_layer_file_upload(
|
||||
&timeline_path.join("foo"),
|
||||
&LayerFileName::Test("foo".to_owned()),
|
||||
&LayerFileMetadata::new(content_foo.len() as u64),
|
||||
)?;
|
||||
client.schedule_layer_file_upload(
|
||||
&timeline_path.join("bar"),
|
||||
&LayerFileName::Test("bar".to_owned()),
|
||||
&LayerFileMetadata::new(content_bar.len() as u64),
|
||||
)?;
|
||||
|
||||
@@ -1244,10 +1242,10 @@ mod tests {
|
||||
let content_baz = dummy_contents("baz");
|
||||
std::fs::write(timeline_path.join("baz"), &content_baz)?;
|
||||
client.schedule_layer_file_upload(
|
||||
&timeline_path.join("baz"),
|
||||
&LayerFileName::Test("baz".to_owned()),
|
||||
&LayerFileMetadata::new(content_baz.len() as u64),
|
||||
)?;
|
||||
client.schedule_layer_file_deletion(&[timeline_path.join("foo")])?;
|
||||
client.schedule_layer_file_deletion(&[LayerFileName::Test("foo".to_owned())])?;
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
|
||||
@@ -6,15 +6,16 @@ use anyhow::{bail, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::debug;
|
||||
use tracing::{debug, info_span, Instrument};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::storage_sync::index::LayerFileMetadata;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use crate::tenant::filename::LayerFileName;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::IndexPart;
|
||||
use super::index::{IndexPart, IndexPartUnclean};
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
@@ -28,10 +29,16 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
remote_path: &'a RemotePath,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &'a LayerFileName,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let local_path = conf.local_path(remote_path);
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||
|
||||
let remote_path = conf.remote_path(&local_path)?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
@@ -52,7 +59,7 @@ pub async fn download_layer_file<'a>(
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut download = storage.download(remote_path).await.with_context(|| {
|
||||
let mut download = storage.download(&remote_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
@@ -169,7 +176,9 @@ pub async fn list_remote_timelines<'a>(
|
||||
part_downloads.push(async move {
|
||||
(
|
||||
timeline_id,
|
||||
download_index_part(conf, &storage_clone, tenant_id, timeline_id).await,
|
||||
download_index_part(conf, &storage_clone, tenant_id, timeline_id)
|
||||
.instrument(info_span!("download_index_part", timeline=%timeline_id))
|
||||
.await,
|
||||
)
|
||||
});
|
||||
}
|
||||
@@ -211,11 +220,13 @@ pub async fn download_index_part(
|
||||
.with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!("Failed to deserialize index part file into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part = index_part.remove_unclean_layer_file_names();
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
@@ -4,11 +4,11 @@
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::{filename::LayerFileName, metadata::TimelineMetadata};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -62,7 +62,10 @@ impl LayerFileMetadata {
|
||||
/// remember to add a test case for the changed version.
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexPart {
|
||||
pub struct IndexPartImpl<L>
|
||||
where
|
||||
L: std::hash::Hash + PartialEq + Eq,
|
||||
{
|
||||
/// Debugging aid describing the version of this type.
|
||||
#[serde(default)]
|
||||
version: usize,
|
||||
@@ -70,19 +73,19 @@ pub struct IndexPart {
|
||||
/// Layer names, which are stored on the remote storage.
|
||||
///
|
||||
/// Additional metadata might exist in `layer_metadata`.
|
||||
pub timeline_layers: HashSet<String>,
|
||||
pub timeline_layers: HashSet<L>,
|
||||
|
||||
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
|
||||
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
|
||||
/// First pass is to move it to Optional and the next would be its removal
|
||||
missing_layers: Option<HashSet<String>>,
|
||||
missing_layers: Option<HashSet<L>>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
#[serde(default)]
|
||||
pub layer_metadata: HashMap<String, IndexLayerMetadata>,
|
||||
#[serde(default = "HashMap::default")]
|
||||
pub layer_metadata: HashMap<L, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated here for convenience.
|
||||
@@ -91,6 +94,104 @@ pub struct IndexPart {
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
pub type IndexPart = IndexPartImpl<LayerFileName>;
|
||||
|
||||
pub type IndexPartUnclean = IndexPartImpl<UncleanLayerFileName>;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub enum UncleanLayerFileName {
|
||||
Clean(LayerFileName),
|
||||
BackupFile(String),
|
||||
}
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for UncleanLayerFileName {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_string(UncleanLayerFileNameVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
struct UncleanLayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor {
|
||||
type Value = UncleanLayerFileName;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(
|
||||
formatter,
|
||||
"a string that is a valid LayerFileName or '.old' backup file name"
|
||||
)
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
let maybe_clean: Result<LayerFileName, _> = v.parse();
|
||||
match maybe_clean {
|
||||
Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)),
|
||||
Err(e) => {
|
||||
if v.ends_with(".old") {
|
||||
Ok(UncleanLayerFileName::BackupFile(v.to_owned()))
|
||||
} else {
|
||||
Err(E::custom(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UncleanLayerFileName {
|
||||
fn into_clean(self) -> Option<LayerFileName> {
|
||||
match self {
|
||||
UncleanLayerFileName::Clean(clean) => Some(clean),
|
||||
UncleanLayerFileName::BackupFile(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPartUnclean {
|
||||
pub fn remove_unclean_layer_file_names(self) -> IndexPart {
|
||||
let IndexPartUnclean {
|
||||
version,
|
||||
timeline_layers,
|
||||
// this is an unused field, ignore it on cleaning
|
||||
missing_layers: _,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
} = self;
|
||||
|
||||
IndexPart {
|
||||
version,
|
||||
timeline_layers: timeline_layers
|
||||
.into_iter()
|
||||
.filter_map(|unclean_file_name| match unclean_file_name {
|
||||
UncleanLayerFileName::Clean(clean_name) => Some(clean_name),
|
||||
UncleanLayerFileName::BackupFile(backup_file_name) => {
|
||||
// For details see https://github.com/neondatabase/neon/issues/3024
|
||||
warn!(
|
||||
"got backup file on the remote storage, ignoring it {backup_file_name}"
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
missing_layers: None,
|
||||
layer_metadata: layer_metadata
|
||||
.into_iter()
|
||||
.filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
|
||||
.collect(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
|
||||
/// used to understand later versions.
|
||||
@@ -100,23 +201,17 @@ impl IndexPart {
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn new(
|
||||
layers_and_metadata: HashMap<RemotePath, LayerFileMetadata>,
|
||||
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
|
||||
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
|
||||
|
||||
for (remote_path, metadata) in &layers_and_metadata {
|
||||
for (remote_name, metadata) in &layers_and_metadata {
|
||||
timeline_layers.insert(remote_name.to_owned());
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
match remote_path.object_name() {
|
||||
Some(layer_name) => {
|
||||
timeline_layers.insert(layer_name.to_owned());
|
||||
layer_metadata.insert(layer_name.to_owned(), metadata);
|
||||
}
|
||||
// TODO move this on a type level: we know, that every layer entry does have a name
|
||||
None => panic!("Layer {remote_path:?} has no file name, skipping"),
|
||||
}
|
||||
layer_metadata.insert(remote_name.to_owned(), metadata);
|
||||
}
|
||||
|
||||
Self {
|
||||
@@ -156,21 +251,22 @@ mod tests {
|
||||
fn v0_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["not_a_real_layer_but_adding_coverage"],
|
||||
"missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
|
||||
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
missing_layers: None, // disabled fields should not carry unused values further
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part: IndexPartUnclean = serde_json::from_str(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
@@ -179,10 +275,10 @@ mod tests {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["not_a_real_layer_but_adding_coverage"],
|
||||
"missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
@@ -191,13 +287,13 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
|
||||
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
missing_layers: None,
|
||||
layer_metadata: HashMap::from([
|
||||
(String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
(String::from("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
|
||||
(LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
@@ -207,7 +303,9 @@ mod tests {
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example)
|
||||
.unwrap()
|
||||
.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
@@ -218,7 +316,7 @@ mod tests {
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
@@ -227,29 +325,24 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string()]),
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
(
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string(),
|
||||
IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}
|
||||
),
|
||||
(
|
||||
"not_a_real_layer_but_adding_coverage".to_string(),
|
||||
IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
}
|
||||
)
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
(LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
missing_layers: None,
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,7 +71,7 @@ use crate::shutdown_pageserver;
|
||||
//
|
||||
// WAL receiver runtime:
|
||||
// - used to handle WAL receiver connections.
|
||||
// - and to receive updates from etcd
|
||||
// - and to receive updates from storage_broker
|
||||
//
|
||||
// Background runtime
|
||||
// - layer flushing
|
||||
@@ -178,7 +178,7 @@ pub enum TaskKind {
|
||||
PageRequestHandler,
|
||||
|
||||
// Manages the WAL receiver connection for one timeline. It subscribes to
|
||||
// events from etcd, decides which safekeeper to connect to. It spawns a
|
||||
// events from storage_broker, decides which safekeeper to connect to. It spawns a
|
||||
// separate WalReceiverConnection task to handle each connection.
|
||||
WalReceiverManager,
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ use crate::storage_sync::RemoteTimelineClient;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::metadata::load_metadata;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
@@ -89,8 +90,6 @@ mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
use storage_layer::Layer;
|
||||
|
||||
pub use timeline::Timeline;
|
||||
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
|
||||
@@ -60,7 +60,7 @@ where
|
||||
///
|
||||
/// ```no_run
|
||||
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
|
||||
/// # let reader: FileBlockReader<std::fs::File> = todo!();
|
||||
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
|
||||
/// let cursor = reader.block_cursor();
|
||||
/// let buf = cursor.read_blk(1);
|
||||
/// // do stuff with 'buf'
|
||||
|
||||
@@ -30,7 +30,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
@@ -52,6 +54,9 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::filename::LayerFileName;
|
||||
use super::storage_layer::Layer;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
///
|
||||
@@ -194,14 +199,6 @@ pub struct DeltaLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
@@ -209,13 +206,86 @@ impl Layer for DeltaLayer {
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
fn is_incremental(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
inner.index_start_blk, inner.index_root_blk
|
||||
);
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump()?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
@@ -302,6 +372,24 @@ impl Layer for DeltaLayer {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_name().into()
|
||||
}
|
||||
|
||||
fn local_path(&self) -> PathBuf {
|
||||
self.path()
|
||||
}
|
||||
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
|
||||
let inner = match self.load() {
|
||||
@@ -332,89 +420,6 @@ impl Layer for DeltaLayer {
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
inner.index_start_blk, inner.index_root_blk
|
||||
);
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump()?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
@@ -511,8 +516,8 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
|
||||
@@ -7,11 +7,12 @@ use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// Note: Timeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub struct DeltaFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
@@ -101,7 +102,7 @@ impl fmt::Display for DeltaFileName {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub struct ImageFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn: Lsn,
|
||||
@@ -172,6 +173,103 @@ impl fmt::Display for ImageFileName {
|
||||
)
|
||||
}
|
||||
}
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub enum LayerFileName {
|
||||
Image(ImageFileName),
|
||||
Delta(DeltaFileName),
|
||||
#[cfg(test)]
|
||||
Test(String),
|
||||
}
|
||||
|
||||
impl LayerFileName {
|
||||
pub fn file_name(&self) -> String {
|
||||
match self {
|
||||
LayerFileName::Image(fname) => format!("{fname}"),
|
||||
LayerFileName::Delta(fname) => format!("{fname}"),
|
||||
#[cfg(test)]
|
||||
LayerFileName::Test(fname) => fname.to_string(),
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
pub(crate) fn new_test(name: &str) -> LayerFileName {
|
||||
LayerFileName::Test(name.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps an image layer file name in the `Image` variant.
impl From<ImageFileName> for LayerFileName {
    fn from(image_name: ImageFileName) -> Self {
        Self::Image(image_name)
    }
}
|
||||
/// Wraps a delta layer file name in the `Delta` variant.
impl From<DeltaFileName> for LayerFileName {
    fn from(delta_name: DeltaFileName) -> Self {
        Self::Delta(delta_name)
    }
}
|
||||
|
||||
// include a `/` in the name as an additional layer of robustness
|
||||
// because `/` chars are not allowed in UNIX file names
|
||||
#[cfg(test)]
|
||||
const LAYER_FILE_NAME_TEST_PREFIX: &str = "LAYER_FILE_NAME::test/";
|
||||
|
||||
impl FromStr for LayerFileName {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(value: &str) -> Result<Self, Self::Err> {
|
||||
#[cfg(test)]
|
||||
if let Some(value) = value.strip_prefix(LAYER_FILE_NAME_TEST_PREFIX) {
|
||||
return Ok(LayerFileName::Test(value.to_owned()));
|
||||
}
|
||||
let delta = DeltaFileName::parse_str(value);
|
||||
let image = ImageFileName::parse_str(value);
|
||||
let ok = match (delta, image) {
|
||||
(None, None) => {
|
||||
return Err(format!(
|
||||
"neither delta nor image layer file name: {value:?}"
|
||||
))
|
||||
}
|
||||
(Some(delta), None) => LayerFileName::Delta(delta),
|
||||
(None, Some(image)) => LayerFileName::Image(image),
|
||||
(Some(_), Some(_)) => unreachable!(),
|
||||
};
|
||||
Ok(ok)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for LayerFileName {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
match self {
|
||||
LayerFileName::Image(fname) => serializer.serialize_str(&format!("{}", fname)),
|
||||
LayerFileName::Delta(fname) => serializer.serialize_str(&format!("{}", fname)),
|
||||
#[cfg(test)]
|
||||
LayerFileName::Test(t) => {
|
||||
serializer.serialize_str(&format!("{LAYER_FILE_NAME_TEST_PREFIX}{t}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct LayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
|
||||
type Value = LayerFileName;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
formatter,
|
||||
"a string that is a valid image or delta layer file name"
|
||||
)
|
||||
}
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
v.parse().map_err(|e| E::custom(e))
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper enum to hold a PageServerConf, or a path
|
||||
///
|
||||
|
||||
@@ -26,7 +26,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{ImageFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
@@ -48,6 +50,9 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::filename::LayerFileName;
|
||||
use super::storage_layer::Layer;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
///
|
||||
@@ -120,22 +125,6 @@ pub struct ImageLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
@@ -144,58 +133,12 @@ impl Layer for ImageLayer {
|
||||
// End-bound is exclusive
|
||||
self.lsn..(self.lsn + 1)
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.filename().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
@@ -223,6 +166,68 @@ impl Layer for ImageLayer {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.path().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_name().into()
|
||||
}
|
||||
|
||||
fn local_path(&self) -> PathBuf {
|
||||
self.path()
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
@@ -314,8 +319,8 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
|
||||
@@ -10,9 +10,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
use crate::walrecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{ensure, Result};
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
@@ -26,9 +26,10 @@ use utils::{
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use super::storage_layer::Layer;
|
||||
|
||||
thread_local! {
|
||||
/// A buffer for serializing object during [`InMemoryLayer::put_value`].
|
||||
/// This buffer is reused for each serialization to avoid additional malloc calls.
|
||||
@@ -75,33 +76,13 @@ impl InMemoryLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
// An in-memory layer can be spilled to disk into ephemeral file,
|
||||
// This function is used only for debugging, so we don't need to be very precise.
|
||||
// Construct a filename as if it was a delta layer.
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
|
||||
PathBuf::from(format!(
|
||||
"inmem-{:016X}-{:016X}",
|
||||
self.start_lsn.0, end_lsn.0
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
impl InMemoryLayer {
|
||||
pub fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
}
|
||||
@@ -116,73 +97,16 @@ impl Layer for InMemoryLayer {
|
||||
};
|
||||
self.start_lsn..end_lsn
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let mut reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn delete(&self) -> Result<()> {
|
||||
bail!("can't delete an InMemoryLayer")
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
// in-memory layer is always considered incremental.
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
true
|
||||
fn short_id(&self) -> String {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
@@ -235,6 +159,55 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let mut reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
use crate::metrics::NUM_ONDISK_LAYERS;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::inmemory_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::tenant::storage_layer::{range_eq, range_overlaps};
|
||||
use amplify_num::i256;
|
||||
use anyhow::Result;
|
||||
@@ -28,11 +27,12 @@ use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::storage_layer::Layer;
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct LayerMap {
|
||||
pub struct LayerMap<L: ?Sized> {
|
||||
//
|
||||
// 'open_layer' holds the current InMemoryLayer that is accepting new
|
||||
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
|
||||
@@ -53,15 +53,27 @@ pub struct LayerMap {
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
/// All the historic layers are kept here
|
||||
historic_layers: RTree<LayerRTreeObject>,
|
||||
historic_layers: RTree<LayerRTreeObject<L>>,
|
||||
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
l0_delta_layers: Vec<Arc<dyn Layer>>,
|
||||
l0_delta_layers: Vec<Arc<L>>,
|
||||
}
|
||||
|
||||
struct LayerRTreeObject {
|
||||
layer: Arc<dyn Layer>,
|
||||
impl<L: ?Sized> Default for LayerMap<L> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
open_layer: None,
|
||||
next_open_layer_at: None,
|
||||
frozen_layers: VecDeque::default(),
|
||||
historic_layers: RTree::default(),
|
||||
l0_delta_layers: Vec::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct LayerRTreeObject<L: ?Sized> {
|
||||
layer: Arc<L>,
|
||||
|
||||
envelope: AABB<[IntKey; 2]>,
|
||||
}
|
||||
@@ -185,7 +197,7 @@ impl Num for IntKey {
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for LayerRTreeObject {
|
||||
impl<T: ?Sized> PartialEq for LayerRTreeObject<T> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||
// references. Clippy complains about this. In practice it
|
||||
@@ -196,15 +208,21 @@ impl PartialEq for LayerRTreeObject {
|
||||
}
|
||||
}
|
||||
|
||||
impl RTreeObject for LayerRTreeObject {
|
||||
impl<L> RTreeObject for LayerRTreeObject<L>
|
||||
where
|
||||
L: ?Sized,
|
||||
{
|
||||
type Envelope = AABB<[IntKey; 2]>;
|
||||
fn envelope(&self) -> Self::Envelope {
|
||||
self.envelope
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerRTreeObject {
|
||||
fn new(layer: Arc<dyn Layer>) -> Self {
|
||||
impl<L> LayerRTreeObject<L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
fn new(layer: Arc<L>) -> Self {
|
||||
let key_range = layer.get_key_range();
|
||||
let lsn_range = layer.get_lsn_range();
|
||||
|
||||
@@ -223,12 +241,15 @@ impl LayerRTreeObject {
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<dyn Layer>,
|
||||
pub struct SearchResult<L: ?Sized> {
|
||||
pub layer: Arc<L>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
impl<L> LayerMap<L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
///
|
||||
/// Find the latest layer that covers the given 'key', with lsn <
|
||||
/// 'end_lsn'.
|
||||
@@ -240,10 +261,10 @@ impl LayerMap {
|
||||
/// contain the version, even if it's missing from the returned
|
||||
/// layer.
|
||||
///
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult>> {
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult<L>>> {
|
||||
// linear search
|
||||
// Find the latest image layer that covers the given key
|
||||
let mut latest_img: Option<Arc<dyn Layer>> = None;
|
||||
let mut latest_img: Option<Arc<L>> = None;
|
||||
let mut latest_img_lsn: Option<Lsn> = None;
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key.to_i128()), IntKey::from(0i128)],
|
||||
@@ -277,7 +298,7 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
// Search the delta layers
|
||||
let mut latest_delta: Option<Arc<dyn Layer>> = None;
|
||||
let mut latest_delta: Option<Arc<L>> = None;
|
||||
for e in self
|
||||
.historic_layers
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
@@ -301,7 +322,7 @@ impl LayerMap {
|
||||
// No need to search any further
|
||||
trace!(
|
||||
"found layer {} for request on {key} at {end_lsn}",
|
||||
l.filename().display(),
|
||||
l.short_id(),
|
||||
);
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
break;
|
||||
@@ -319,7 +340,7 @@ impl LayerMap {
|
||||
if let Some(l) = latest_delta {
|
||||
trace!(
|
||||
"found (old) layer {} for request on {key} at {end_lsn}",
|
||||
l.filename().display(),
|
||||
l.short_id(),
|
||||
);
|
||||
let lsn_floor = std::cmp::max(
|
||||
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
|
||||
@@ -344,7 +365,7 @@ impl LayerMap {
|
||||
///
|
||||
/// Insert an on-disk layer
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
pub fn insert_historic(&mut self, layer: Arc<L>) {
|
||||
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
||||
self.l0_delta_layers.push(layer.clone());
|
||||
}
|
||||
@@ -357,7 +378,7 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
pub fn remove_historic(&mut self, layer: Arc<L>) {
|
||||
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
|
||||
@@ -426,13 +447,13 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
|
||||
/// Iterate over every entry of the `historic_layers` R-tree, yielding a
/// cloned `Arc` handle to each layer.
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
    self.historic_layers
        .iter()
        .map(|entry| Arc::clone(&entry.layer))
}
|
||||
|
||||
/// Find the last image layer that covers 'key', ignoring any image layers
|
||||
/// newer than 'lsn'.
|
||||
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<L>> {
|
||||
let mut candidate_lsn = Lsn(0);
|
||||
let mut candidate = None;
|
||||
let envelope = AABB::from_corners(
|
||||
@@ -474,7 +495,7 @@ impl LayerMap {
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
|
||||
let mut points = vec![key_range.start];
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key_range.start.to_i128()), IntKey::from(0)],
|
||||
@@ -559,7 +580,7 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
/// Return all L0 delta layers as a freshly allocated vector of `Arc`
/// handles. The handles are cloned; the layers themselves are not copied.
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
    let l0_layers: Vec<Arc<L>> = self.l0_delta_layers.iter().cloned().collect();
    Ok(l0_layers)
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::filename::LayerFileName;
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
@@ -69,26 +70,9 @@ pub enum ValueReconstructResult {
|
||||
Missing,
|
||||
}
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
/// layers are used to ingest incoming WAL, and provide fast access to the
|
||||
/// recent page versions. On-disk layers are stored as files on disk, and are
|
||||
/// immutable. This trait presents the common functionality of in-memory and
|
||||
/// on-disk layers.
|
||||
///
|
||||
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN
|
||||
///
|
||||
/// Supertrait of the [`PersistentLayer`] trait that captures the bare minimum interface
|
||||
/// required by [`LayerMap`].
|
||||
pub trait Layer: Send + Sync {
|
||||
fn get_tenant_id(&self) -> TenantId;
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> TimelineId;
|
||||
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -100,13 +84,11 @@ pub trait Layer: Send + Sync {
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
fn get_lsn_range(&self) -> Range<Lsn>;
|
||||
|
||||
/// Filename used to store this layer on disk. (Even in-memory layers
|
||||
/// implement this, to print a handy unique identifier for the layer for
|
||||
/// log messages, even though they're never on disk.)
|
||||
fn filename(&self) -> PathBuf;
|
||||
|
||||
/// If a layer has a corresponding file on a local filesystem, return its absolute path.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
|
||||
///
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
@@ -127,14 +109,39 @@ pub trait Layer: Send + Sync {
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
|
||||
fn short_id(&self) -> String;
|
||||
|
||||
/// Returns true for layers that are represented in memory.
|
||||
fn is_in_memory(&self) -> bool;
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool) -> Result<()>;
|
||||
}
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
/// layers are used to ingest incoming WAL, and provide fast access to the
|
||||
/// recent page versions. On-disk layers are stored as files on disk, and are
|
||||
/// immutable. This trait presents the common functionality of in-memory and
|
||||
/// on-disk layers.
|
||||
///
|
||||
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN
|
||||
///
|
||||
pub trait PersistentLayer: Layer {
|
||||
fn get_tenant_id(&self) -> TenantId;
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> TimelineId;
|
||||
|
||||
/// File name used for this layer, both in the pageserver's local filesystem
|
||||
/// state as well as in the remote storage.
|
||||
fn filename(&self) -> LayerFileName;
|
||||
|
||||
/// Path to the layer file in the local filesystem.
|
||||
fn local_path(&self) -> PathBuf;
|
||||
|
||||
/// Iterate through all keys and values stored in the layer
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
|
||||
@@ -147,7 +154,4 @@ pub trait Layer: Send + Sync {
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete(&self) -> Result<()>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool) -> Result<()>;
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use tokio::task::spawn_blocking;
|
||||
use tracing::*;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -30,7 +30,7 @@ use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::{save_metadata, TimelineMetadata},
|
||||
par_fsync,
|
||||
storage_layer::{Layer, ValueReconstructResult, ValueReconstructState},
|
||||
storage_layer::{PersistentLayer, ValueReconstructResult, ValueReconstructState},
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
@@ -54,7 +54,7 @@ use utils::{
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
|
||||
use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task};
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::METADATA_FILE_NAME;
|
||||
@@ -62,6 +62,9 @@ use crate::ZERO_PAGE;
|
||||
use crate::{is_temporary, task_mgr};
|
||||
use crate::{page_cache, storage_sync::index::LayerFileMetadata};
|
||||
|
||||
use super::filename::LayerFileName;
|
||||
use super::storage_layer::Layer;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
enum FlushLoopState {
|
||||
NotStarted,
|
||||
@@ -78,7 +81,7 @@ pub struct Timeline {
|
||||
|
||||
pub pg_version: u32,
|
||||
|
||||
pub layers: RwLock<LayerMap>,
|
||||
pub layers: RwLock<LayerMap<dyn PersistentLayer>>,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
// Atomic would be more appropriate here.
|
||||
@@ -856,12 +859,12 @@ impl Timeline {
|
||||
}
|
||||
|
||||
pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
|
||||
if !is_etcd_client_initialized() {
|
||||
if !is_broker_client_initialized() {
|
||||
if cfg!(test) {
|
||||
info!("not launching WAL receiver because etcd client hasn't been initialized");
|
||||
info!("not launching WAL receiver because broker client hasn't been initialized");
|
||||
return;
|
||||
} else {
|
||||
panic!("etcd client not initialized");
|
||||
panic!("broker client not initialized");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -882,7 +885,6 @@ impl Timeline {
|
||||
drop(tenant_conf_guard);
|
||||
let self_clone = Arc::clone(self);
|
||||
spawn_connection_manager_task(
|
||||
self.conf.broker_etcd_prefix.clone(),
|
||||
self_clone,
|
||||
walreceiver_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
@@ -928,7 +930,7 @@ impl Timeline {
|
||||
let layer =
|
||||
ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);
|
||||
|
||||
trace!("found layer {}", layer.filename().display());
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += layer.path().metadata()?.len();
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
@@ -952,7 +954,7 @@ impl Timeline {
|
||||
let layer =
|
||||
DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);
|
||||
|
||||
trace!("found layer {}", layer.filename().display());
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += layer.path().metadata()?.len();
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
@@ -999,9 +1001,9 @@ impl Timeline {
|
||||
&self,
|
||||
index_part: &IndexPart,
|
||||
remote_client: &RemoteTimelineClient,
|
||||
local_layers: HashSet<PathBuf>,
|
||||
local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
|
||||
up_to_date_disk_consistent_lsn: Lsn,
|
||||
) -> anyhow::Result<HashSet<PathBuf>> {
|
||||
) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
|
||||
// Are we missing some files that are present in remote storage?
|
||||
// Download them now.
|
||||
// TODO Downloading many files this way is not efficient.
|
||||
@@ -1011,10 +1013,8 @@ impl Timeline {
|
||||
// 1) if there was another pageserver that came and generated new files
|
||||
// 2) during attach of a timeline with big history which we currently do not do
|
||||
let mut local_only_layers = local_layers;
|
||||
let timeline_dir = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
for remote_layer_name in &index_part.timeline_layers {
|
||||
let local_layer_path = timeline_dir.join(remote_layer_name);
|
||||
local_only_layers.remove(&local_layer_path);
|
||||
let local_layer = local_only_layers.remove(remote_layer_name);
|
||||
|
||||
let remote_layer_metadata = index_part
|
||||
.layer_metadata
|
||||
@@ -1022,118 +1022,122 @@ impl Timeline {
|
||||
.map(LayerFileMetadata::from)
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
|
||||
let remote_layer_path = self
|
||||
.conf
|
||||
.remote_path(&local_layer_path)
|
||||
.expect("local_layer_path received from the same conf that provided a workdir");
|
||||
// Is the local layer's size different from the size stored in the
|
||||
// remote index file? If so, rename_to_backup those files & remove
|
||||
// local_layer from the layer map.
|
||||
// We'll download a fresh copy of the layer file below.
|
||||
if let Some(local_layer) = local_layer {
|
||||
let local_layer_path = local_layer.local_path();
|
||||
ensure!(
|
||||
local_layer_path.exists(),
|
||||
"every layer from local_layers must exist on disk: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
if local_layer_path.exists() {
|
||||
let mut already_downloaded = true;
|
||||
// Are there any local files that exist, with a size that doesn't match
|
||||
// with the size stored in the remote index file?
|
||||
// If so, rename_to_backup those files so that we re-download them later.
|
||||
if let Some(remote_size) = remote_layer_metadata.file_size() {
|
||||
match local_layer_path.metadata() {
|
||||
Ok(metadata) => {
|
||||
let local_size = metadata.len();
|
||||
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
error!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.current_physical_size_gauge.sub(local_size);
|
||||
already_downloaded = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!("could not get size of local file {local_layer_path:?}: {err:?}")
|
||||
let metadata = local_layer_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"get file size of local layer {}",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_size = metadata.len();
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.current_physical_size_gauge.sub(local_size);
|
||||
self.layers.write().unwrap().remove_historic(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
debug!(
|
||||
"layer is present locally and file size matches remote, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if already_downloaded {
|
||||
} else {
|
||||
debug!(
|
||||
"layer is present locally and remote does not have file size, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
info!("remote layer {remote_layer_path:?} does not exist locally");
|
||||
}
|
||||
|
||||
let layer_name = local_layer_path
|
||||
.file_name()
|
||||
.and_then(|os_str| os_str.to_str())
|
||||
.with_context(|| {
|
||||
format!("Layer file {local_layer_path:?} has no name in unicode")
|
||||
})?;
|
||||
if let Some(imgfilename) = ImageFileName::parse_str(layer_name) {
|
||||
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
|
||||
warn!(
|
||||
info!(
|
||||
"remote layer does not exist locally, downloading it now: {}",
|
||||
remote_layer_name.file_name()
|
||||
);
|
||||
|
||||
match remote_layer_name {
|
||||
LayerFileName::Image(imgfilename) => {
|
||||
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
|
||||
warn!(
|
||||
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
|
||||
trace!("downloading image file: {remote_layer_name:?}");
|
||||
let downloaded_size = remote_client
|
||||
.download_layer_file(remote_layer_name, &remote_layer_metadata)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("failed to download image layer {remote_layer_name:?}")
|
||||
})?;
|
||||
trace!("done");
|
||||
|
||||
let image_layer =
|
||||
ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename);
|
||||
|
||||
self.layers
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert_historic(Arc::new(image_layer));
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.add(downloaded_size);
|
||||
}
|
||||
|
||||
trace!("downloading image file: {remote_layer_path:?}");
|
||||
let downloaded_size = remote_client
|
||||
.download_layer_file(&remote_layer_path, &remote_layer_metadata)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("failed to download image layer from path {remote_layer_path:?}")
|
||||
})?;
|
||||
trace!("done");
|
||||
|
||||
let image_layer =
|
||||
ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);
|
||||
|
||||
self.layers
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert_historic(Arc::new(image_layer));
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.add(downloaded_size);
|
||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(layer_name) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
|
||||
warn!(
|
||||
LayerFileName::Delta(deltafilename) => {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
|
||||
warn!(
|
||||
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
|
||||
trace!("downloading delta file: {remote_layer_name:?}");
|
||||
let sz = remote_client
|
||||
.download_layer_file(remote_layer_name, &remote_layer_metadata)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("failed to download delta layer {remote_layer_name:?}")
|
||||
})?;
|
||||
trace!("done");
|
||||
|
||||
let delta_layer =
|
||||
DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename);
|
||||
|
||||
self.layers
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert_historic(Arc::new(delta_layer));
|
||||
self.metrics.current_physical_size_gauge.add(sz);
|
||||
}
|
||||
|
||||
trace!("downloading delta file: {remote_layer_path:?}");
|
||||
let sz = remote_client
|
||||
.download_layer_file(&remote_layer_path, &remote_layer_metadata)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("failed to download delta layer from path {remote_layer_path:?}")
|
||||
})?;
|
||||
trace!("done");
|
||||
|
||||
let delta_layer =
|
||||
DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);
|
||||
|
||||
self.layers
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert_historic(Arc::new(delta_layer));
|
||||
self.metrics.current_physical_size_gauge.add(sz);
|
||||
} else if layer_name.ends_with(".old") {
|
||||
// For details see https://github.com/neondatabase/neon/issues/3024
|
||||
warn!(
|
||||
"got backup file on the remote storage, ignoring it {file}",
|
||||
file = layer_name
|
||||
)
|
||||
} else {
|
||||
bail!("unexpected layer filename {layer_name} in remote storage path: {remote_layer_path:?}");
|
||||
#[cfg(test)]
|
||||
LayerFileName::Test(_) => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1170,18 +1174,13 @@ impl Timeline {
|
||||
|
||||
let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
|
||||
|
||||
// Build a map of local layers for quick lookups
|
||||
let local_layers = self
|
||||
.layers
|
||||
.read()
|
||||
.unwrap()
|
||||
.iter_historic_layers()
|
||||
.map(|historic_layer| {
|
||||
historic_layer
|
||||
.local_path()
|
||||
.expect("Historic layers should have a path")
|
||||
})
|
||||
.collect::<HashSet<_>>();
|
||||
.map(|l| (l.filename(), l))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let local_only_layers = match index_part {
|
||||
Some(index_part) => {
|
||||
@@ -1190,6 +1189,7 @@ impl Timeline {
|
||||
index_part.timeline_layers.len()
|
||||
);
|
||||
remote_client.init_upload_queue(index_part)?;
|
||||
|
||||
self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn)
|
||||
.await?
|
||||
}
|
||||
@@ -1201,14 +1201,15 @@ impl Timeline {
|
||||
};
|
||||
|
||||
// Are there local files that don't exist remotely? Schedule uploads for them
|
||||
for layer_path in &local_only_layers {
|
||||
for (layer_name, layer) in &local_only_layers {
|
||||
let layer_path = layer.local_path();
|
||||
let layer_size = layer_path
|
||||
.metadata()
|
||||
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
|
||||
.len();
|
||||
info!("scheduling {layer_path:?} for upload");
|
||||
remote_client
|
||||
.schedule_layer_file_upload(layer_path, &LayerFileMetadata::new(layer_size))?;
|
||||
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
|
||||
}
|
||||
if !local_only_layers.is_empty() {
|
||||
remote_client.schedule_index_upload(up_to_date_metadata)?;
|
||||
@@ -1323,7 +1324,36 @@ impl Timeline {
|
||||
Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Textual identifier of a layer, as recorded in the traversal path that is
/// collected for debugging while looking up a key.
type TraversalId = String;

/// Extension trait that produces the [`TraversalId`] of a layer handle.
trait TraversalLayerExt {
    /// Returns the identifier under which this layer is recorded in the
    /// traversal path.
    fn traversal_id(&self) -> TraversalId;
}
|
||||
|
||||
impl TraversalLayerExt for Arc<dyn PersistentLayer> {
    /// Identify a persistent layer by the path of its local file.
    ///
    /// In builds with debug assertions enabled, this also checks that the
    /// path contains the layer's timeline ID, because the timeline ID is what
    /// keeps the identifier unique once a traversal crosses into an ancestor
    /// timeline.
    fn traversal_id(&self) -> String {
        let local_path = self.local_path();

        // A lossy conversion is enough for a substring check and, unlike
        // `to_str().unwrap()`, cannot panic on a non-UTF-8 path.
        debug_assert!(
            local_path
                .to_string_lossy()
                .contains(&self.get_timeline_id().to_string()),
            "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
        );

        local_path.display().to_string()
    }
}
|
||||
|
||||
impl TraversalLayerExt for Arc<InMemoryLayer> {
    /// Identify an in-memory layer by its timeline ID combined with its
    /// short ID.
    fn traversal_id(&self) -> String {
        let timeline_id = self.get_timeline_id();
        let layer_id = self.short_id();

        format!("timeline {} in-memory {}", timeline_id, layer_id)
    }
}
|
||||
|
||||
impl Timeline {
|
||||
///
|
||||
/// Get a handle to a Layer for reading.
|
||||
///
|
||||
@@ -1344,7 +1374,7 @@ impl Timeline {
|
||||
|
||||
// For debugging purposes, collect the path of layers that we traversed
|
||||
// through. It's included in the error message if we fail to find the key.
|
||||
let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc<dyn Layer>)> = Vec::new();
|
||||
let mut traversal_path = Vec::<(ValueReconstructResult, Lsn, TraversalId)>::new();
|
||||
|
||||
let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
|
||||
*cached_lsn
|
||||
@@ -1426,7 +1456,7 @@ impl Timeline {
|
||||
reconstruct_state,
|
||||
)?;
|
||||
cont_lsn = lsn_floor;
|
||||
traversal_path.push((result, cont_lsn, open_layer.clone()));
|
||||
traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -1441,7 +1471,7 @@ impl Timeline {
|
||||
reconstruct_state,
|
||||
)?;
|
||||
cont_lsn = lsn_floor;
|
||||
traversal_path.push((result, cont_lsn, frozen_layer.clone()));
|
||||
traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
@@ -1456,7 +1486,7 @@ impl Timeline {
|
||||
reconstruct_state,
|
||||
)?;
|
||||
cont_lsn = lsn_floor;
|
||||
traversal_path.push((result, cont_lsn, layer));
|
||||
traversal_path.push((result, cont_lsn, layer.traversal_id()));
|
||||
} else if timeline.ancestor_timeline.is_some() {
|
||||
// Nothing on this timeline. Traverse to parent
|
||||
result = ValueReconstructResult::Continue;
|
||||
@@ -1671,7 +1701,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
#[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
|
||||
#[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
|
||||
async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
|
||||
// As a special case, when we have just imported an image into the repository,
|
||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
||||
@@ -1730,7 +1760,7 @@ impl Timeline {
|
||||
fn update_metadata_file(
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
|
||||
layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
// We can only save a valid 'prev_record_lsn' value on disk if we
|
||||
// flushed *all* in-memory changes to disk. We only track
|
||||
@@ -1795,10 +1825,11 @@ impl Timeline {
|
||||
fn create_delta_layer(
|
||||
&self,
|
||||
frozen_layer: &InMemoryLayer,
|
||||
) -> anyhow::Result<(PathBuf, LayerFileMetadata)> {
|
||||
) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
|
||||
// Write it out
|
||||
let new_delta = frozen_layer.write_to_disk()?;
|
||||
let new_delta_path = new_delta.path();
|
||||
let new_delta_filename = new_delta.filename();
|
||||
|
||||
// Sync it to disk.
|
||||
//
|
||||
@@ -1827,7 +1858,7 @@ impl Timeline {
|
||||
self.metrics.num_persistent_files_created.inc_by(1);
|
||||
self.metrics.persistent_bytes_written.inc_by(sz);
|
||||
|
||||
Ok((new_delta_path, LayerFileMetadata::new(sz)))
|
||||
Ok((new_delta_filename, LayerFileMetadata::new(sz)))
|
||||
}
|
||||
|
||||
fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||
@@ -1889,7 +1920,7 @@ impl Timeline {
|
||||
partitioning: &KeyPartitioning,
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
) -> anyhow::Result<HashMap<PathBuf, LayerFileMetadata>> {
|
||||
) -> anyhow::Result<HashMap<LayerFileName, LayerFileMetadata>> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
||||
for partition in partitioning.parts.iter() {
|
||||
@@ -1967,9 +1998,10 @@ impl Timeline {
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
for l in image_layers {
|
||||
let path = l.path();
|
||||
let metadata = path.metadata()?;
|
||||
let path = l.filename();
|
||||
let metadata = timeline_path.join(path.file_name()).metadata()?;
|
||||
|
||||
layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
|
||||
|
||||
@@ -1985,7 +2017,7 @@ impl Timeline {
|
||||
#[derive(Default)]
|
||||
struct CompactLevel0Phase1Result {
|
||||
new_layers: Vec<DeltaLayer>,
|
||||
deltas_to_compact: Vec<Arc<dyn Layer>>,
|
||||
deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -2043,7 +2075,7 @@ impl Timeline {
|
||||
level0_deltas.len()
|
||||
);
|
||||
for l in deltas_to_compact.iter() {
|
||||
info!("compact includes {}", l.filename().display());
|
||||
info!("compact includes {}", l.filename().file_name());
|
||||
}
|
||||
// We don't need the original list of layers anymore. Drop it so that
|
||||
// we don't accidentally use it later in the function.
|
||||
@@ -2272,7 +2304,7 @@ impl Timeline {
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_upload(
|
||||
&new_delta_path,
|
||||
&l.filename(),
|
||||
&LayerFileMetadata::new(metadata.len()),
|
||||
)?;
|
||||
}
|
||||
@@ -2281,19 +2313,19 @@ impl Timeline {
|
||||
self.metrics.current_physical_size_gauge.add(metadata.len());
|
||||
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
layers.insert_historic(Arc::new(l));
|
||||
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
|
||||
layers.insert_historic(x);
|
||||
}
|
||||
|
||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||
// delete the old ones
|
||||
let mut layer_paths_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
||||
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact {
|
||||
if let Some(path) = l.local_path() {
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.sub(path.metadata()?.len());
|
||||
layer_paths_to_delete.push(path);
|
||||
}
|
||||
let path = l.local_path();
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.sub(path.metadata()?.len());
|
||||
layer_names_to_delete.push(l.filename());
|
||||
l.delete()?;
|
||||
layers.remove_historic(l);
|
||||
}
|
||||
@@ -2301,7 +2333,7 @@ impl Timeline {
|
||||
|
||||
// Also schedule the deletions in remote storage
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_deletion(&layer_paths_to_delete)?;
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -2486,23 +2518,13 @@ impl Timeline {
|
||||
//
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
// This layer is in the process of being flushed to disk.
|
||||
// It will be swapped out of the layer map, replaced with
|
||||
// on-disk layers containing the same data.
|
||||
// We can't GC it, as it's not on disk. We can't remove it
|
||||
// from the layer map yet, as it would make its data
|
||||
// inaccessible.
|
||||
if l.is_in_memory() {
|
||||
continue;
|
||||
}
|
||||
|
||||
result.layers_total += 1;
|
||||
|
||||
// 1. Is it newer than GC horizon cutoff point?
|
||||
if l.get_lsn_range().end > horizon_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than horizon_cutoff {}",
|
||||
l.filename().display(),
|
||||
l.filename().file_name(),
|
||||
horizon_cutoff
|
||||
);
|
||||
result.layers_needed_by_cutoff += 1;
|
||||
@@ -2513,7 +2535,7 @@ impl Timeline {
|
||||
if l.get_lsn_range().end > pitr_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than pitr_cutoff {}",
|
||||
l.filename().display(),
|
||||
l.filename().file_name(),
|
||||
pitr_cutoff
|
||||
);
|
||||
result.layers_needed_by_pitr += 1;
|
||||
@@ -2530,7 +2552,7 @@ impl Timeline {
|
||||
if &l.get_lsn_range().start <= retain_lsn {
|
||||
debug!(
|
||||
"keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
|
||||
l.filename().display(),
|
||||
l.filename().file_name(),
|
||||
retain_lsn,
|
||||
l.is_incremental(),
|
||||
);
|
||||
@@ -2563,7 +2585,7 @@ impl Timeline {
|
||||
{
|
||||
debug!(
|
||||
"keeping {} because it is the latest layer",
|
||||
l.filename().display()
|
||||
l.filename().file_name()
|
||||
);
|
||||
result.layers_not_updated += 1;
|
||||
continue 'outer;
|
||||
@@ -2572,7 +2594,7 @@ impl Timeline {
|
||||
// We didn't find any reason to keep this file, so remove it.
|
||||
debug!(
|
||||
"garbage collecting {} is_dropped: xx is_incremental: {}",
|
||||
l.filename().display(),
|
||||
l.filename().file_name(),
|
||||
l.is_incremental(),
|
||||
);
|
||||
layers_to_remove.push(Arc::clone(&l));
|
||||
@@ -2581,14 +2603,13 @@ impl Timeline {
|
||||
// Actually delete the layers from disk and remove them from the map.
|
||||
// (couldn't do this in the loop above, because you cannot modify a collection
|
||||
// while iterating it. BTreeMap::retain() would be another option)
|
||||
let mut layer_paths_to_delete = Vec::with_capacity(layers_to_remove.len());
|
||||
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
|
||||
for doomed_layer in layers_to_remove {
|
||||
if let Some(path) = doomed_layer.local_path() {
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.sub(path.metadata()?.len());
|
||||
layer_paths_to_delete.push(path);
|
||||
}
|
||||
let path = doomed_layer.local_path();
|
||||
self.metrics
|
||||
.current_physical_size_gauge
|
||||
.sub(path.metadata()?.len());
|
||||
layer_names_to_delete.push(doomed_layer.filename());
|
||||
doomed_layer.delete()?;
|
||||
layers.remove_historic(doomed_layer);
|
||||
result.layers_removed += 1;
|
||||
@@ -2604,7 +2625,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_deletion(&layer_paths_to_delete)?;
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
result.elapsed = now.elapsed()?;
|
||||
@@ -2689,7 +2710,7 @@ impl Timeline {
|
||||
/// to an error, as anyhow context information.
|
||||
fn layer_traversal_error(
|
||||
msg: String,
|
||||
path: Vec<(ValueReconstructResult, Lsn, Arc<dyn Layer>)>,
|
||||
path: Vec<(ValueReconstructResult, Lsn, TraversalId)>,
|
||||
) -> anyhow::Result<()> {
|
||||
// We want the original 'msg' to be the outermost context. The outermost context
|
||||
// is the most high-level information, which also gets propagated to the client.
|
||||
@@ -2698,9 +2719,7 @@ fn layer_traversal_error(
|
||||
.map(|(r, c, l)| {
|
||||
format!(
|
||||
"layer traversal: result {:?}, cont_lsn {}, layer: {}",
|
||||
r,
|
||||
c,
|
||||
l.filename().display()
|
||||
r, c, l,
|
||||
)
|
||||
})
|
||||
.chain(std::iter::once(msg));
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
//! hence WAL receiver needs to react on such events.
|
||||
//!
|
||||
//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
|
||||
//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically.
|
||||
//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
|
||||
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
|
||||
//! Without this data, no WAL streaming is possible currently.
|
||||
//!
|
||||
@@ -26,57 +26,49 @@ mod walreceiver_connection;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use etcd_broker::Client;
|
||||
use itertools::Itertools;
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::future::Future;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::watch;
|
||||
use tracing::*;
|
||||
use url::Url;
|
||||
|
||||
pub use connection_manager::spawn_connection_manager_task;
|
||||
|
||||
static ETCD_CLIENT: OnceCell<Client> = OnceCell::new();
|
||||
static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
|
||||
|
||||
///
|
||||
/// Initialize the etcd client. This must be called once at page server startup.
|
||||
/// Initialize the broker client. This must be called once at page server startup.
|
||||
///
|
||||
pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
let etcd_endpoints = conf.broker_endpoints.clone();
|
||||
ensure!(
|
||||
!etcd_endpoints.is_empty(),
|
||||
"Cannot start wal receiver: etcd endpoints are empty"
|
||||
);
|
||||
pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
let broker_endpoint = conf.broker_endpoint.clone();
|
||||
|
||||
let etcd_client = Client::connect(etcd_endpoints.clone(), None)
|
||||
.await
|
||||
.context("Failed to connect to etcd")?;
|
||||
// Note: we do not attempt connecting here (but validate endpoints sanity).
|
||||
let broker_client = storage_broker::connect(broker_endpoint.clone()).context(format!(
|
||||
"Failed to create broker client to {}",
|
||||
&conf.broker_endpoint
|
||||
))?;
|
||||
|
||||
// FIXME: Should we still allow the pageserver to start, if etcd
|
||||
// doesn't work? It could still serve GetPage requests, with the
|
||||
// data it has locally and from what it can download from remote
|
||||
// storage
|
||||
if ETCD_CLIENT.set(etcd_client).is_err() {
|
||||
panic!("etcd already initialized");
|
||||
if BROKER_CLIENT.set(broker_client).is_err() {
|
||||
panic!("broker already initialized");
|
||||
}
|
||||
|
||||
info!(
|
||||
"Initialized etcd client with endpoints: {}",
|
||||
etcd_endpoints.iter().map(Url::to_string).join(", ")
|
||||
"Initialized broker client with endpoints: {}",
|
||||
broker_endpoint
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the etcd client
|
||||
/// Get a handle to the broker client
|
||||
///
|
||||
pub fn get_etcd_client() -> &'static etcd_broker::Client {
|
||||
ETCD_CLIENT.get().expect("etcd client not initialized")
|
||||
pub fn get_broker_client() -> &'static BrokerClientChannel {
|
||||
BROKER_CLIENT.get().expect("broker client not initialized")
|
||||
}
|
||||
|
||||
pub fn is_etcd_client_initialized() -> bool {
|
||||
ETCD_CLIENT.get().is_some()
|
||||
pub fn is_broker_client_initialized() -> bool {
|
||||
BROKER_CLIENT.get().is_some()
|
||||
}
|
||||
|
||||
/// A handle of an asynchronous task.
|
||||
|
||||
@@ -1,21 +1,15 @@
|
||||
//! WAL receiver logic that ensures the pageserver gets connected to safekeeper,
|
||||
//! that contains the latest WAL to stream and this connection does not go stale.
|
||||
//!
|
||||
//! To achieve that, a etcd broker is used: safekepers propagate their timelines' state in it,
|
||||
//! To achieve that, a storage broker is used: safekeepers propagate their timelines' state in it,
|
||||
//! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection.
|
||||
//! Current connection state is tracked too, to ensure it's not getting stale.
|
||||
//!
|
||||
//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
|
||||
//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new connection leader,
|
||||
//! then a [re]connection happens, if necessary.
|
||||
//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel.
|
||||
//! Only the WAL streaming task is expected to finish; other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.
|
||||
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
num::NonZeroU64,
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||
@@ -23,16 +17,18 @@ use crate::tenant::Timeline;
|
||||
use crate::{task_mgr, walreceiver::TaskStateUpdate};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use etcd_broker::{
|
||||
subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
|
||||
BrokerUpdate, Client,
|
||||
};
|
||||
use pageserver_api::models::TimelineState;
|
||||
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use storage_broker::Streaming;
|
||||
use tokio::{select, sync::watch};
|
||||
use tracing::*;
|
||||
|
||||
use crate::{
|
||||
exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
@@ -45,14 +41,13 @@ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
|
||||
|
||||
/// Spawns the loop to take care of the timeline's WAL streaming connection.
|
||||
pub fn spawn_connection_manager_task(
|
||||
broker_loop_prefix: String,
|
||||
timeline: Arc<Timeline>,
|
||||
wal_connect_timeout: Duration,
|
||||
lagging_wal_timeout: Duration,
|
||||
max_lsn_wal_lag: NonZeroU64,
|
||||
auth_token: Option<Arc<String>>,
|
||||
) {
|
||||
let mut etcd_client = get_etcd_client().clone();
|
||||
let mut broker_client = get_broker_client().clone();
|
||||
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
@@ -65,7 +60,7 @@ pub fn spawn_connection_manager_task(
|
||||
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
info!("WAL receiver broker started, connecting to etcd");
|
||||
info!("WAL receiver manager started, connecting to broker");
|
||||
let mut walreceiver_state = WalreceiverState::new(
|
||||
timeline,
|
||||
wal_connect_timeout,
|
||||
@@ -81,8 +76,7 @@ pub fn spawn_connection_manager_task(
|
||||
return Ok(());
|
||||
},
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
&broker_loop_prefix,
|
||||
&mut etcd_client,
|
||||
&mut broker_client,
|
||||
&mut walreceiver_state,
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
@@ -103,10 +97,9 @@ pub fn spawn_connection_manager_task(
|
||||
|
||||
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
||||
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
|
||||
/// If etcd subscription is cancelled, exits.
|
||||
/// If storage broker subscription is cancelled, exits.
|
||||
async fn connection_manager_loop_step(
|
||||
broker_prefix: &str,
|
||||
etcd_client: &mut Client,
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
walreceiver_state: &mut WalreceiverState,
|
||||
) -> ControlFlow<(), ()> {
|
||||
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
|
||||
@@ -124,13 +117,11 @@ async fn connection_manager_loop_step(
|
||||
timeline_id: walreceiver_state.timeline.timeline_id,
|
||||
};
|
||||
|
||||
// XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go,
|
||||
// running the entire loop step as much as possible to an end.
|
||||
// The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end,
|
||||
// forcing the etcd subscription to exit either way.
|
||||
let mut broker_subscription =
|
||||
subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await;
|
||||
info!("Subscribed for etcd timeline changes, waiting for new etcd data");
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
|
||||
info!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
let time_until_next_retry = walreceiver_state.time_until_next_retry();
|
||||
@@ -145,12 +136,6 @@ async fn connection_manager_loop_step(
|
||||
// - this might change the current desired connection
|
||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||
select! {
|
||||
broker_connection_result = &mut broker_subscription.watcher_handle => {
|
||||
info!("Broker connection was closed from the other side, ending current broker loop step");
|
||||
cleanup_broker_connection(broker_connection_result, walreceiver_state);
|
||||
return ControlFlow::Continue(());
|
||||
},
|
||||
|
||||
Some(wal_connection_update) = async {
|
||||
match walreceiver_state.wal_connection.as_mut() {
|
||||
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
||||
@@ -185,22 +170,16 @@ async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
// Got a new update from etcd
|
||||
broker_update = broker_subscription.value_updates.recv() => {
|
||||
// Got a new update from the broker
|
||||
broker_update = broker_subscription.message() => {
|
||||
match broker_update {
|
||||
Some(broker_update) => walreceiver_state.register_timeline_update(broker_update),
|
||||
None => {
|
||||
info!("Broker sender end was dropped, ending current broker loop step");
|
||||
// Ensure to cancel and wait for the broker subscription task end, to log its result.
|
||||
// Broker sender end is in the broker subscription task and its drop means abnormal task completion.
|
||||
// First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times).
|
||||
broker_subscription.watcher_handle.abort();
|
||||
// Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case),
|
||||
// a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway.
|
||||
cleanup_broker_connection(
|
||||
(&mut broker_subscription.watcher_handle).await,
|
||||
walreceiver_state,
|
||||
);
|
||||
Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update),
|
||||
Err(e) => {
|
||||
error!("broker subscription failed: {e}");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
}
|
||||
@@ -234,17 +213,6 @@ async fn connection_manager_loop_step(
|
||||
_ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
|
||||
}
|
||||
|
||||
// Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly.
|
||||
let mut max_events_to_poll = 100_u32;
|
||||
while max_events_to_poll > 0 {
|
||||
if let Ok(broker_update) = broker_subscription.value_updates.try_recv() {
|
||||
walreceiver_state.register_timeline_update(broker_update);
|
||||
max_events_to_poll -= 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
|
||||
info!("Switching to new connection candidate: {new_candidate:?}");
|
||||
walreceiver_state
|
||||
@@ -285,33 +253,11 @@ async fn wait_for_active_timeline(
|
||||
}
|
||||
}
|
||||
|
||||
fn cleanup_broker_connection(
|
||||
broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
|
||||
walreceiver_state: &mut WalreceiverState,
|
||||
) {
|
||||
match broker_connection_result {
|
||||
Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"),
|
||||
Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"),
|
||||
Err(abort_error) => {
|
||||
if abort_error.is_panic() {
|
||||
error!("Broker connection panicked: {abort_error}")
|
||||
} else {
|
||||
debug!("Broker connection aborted: {abort_error}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
walreceiver_state.wal_stream_candidates.clear();
|
||||
}
|
||||
|
||||
/// Endlessly try to subscribe for broker updates for a given timeline.
|
||||
/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly.
|
||||
/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway.
|
||||
async fn subscribe_for_timeline_updates(
|
||||
etcd_client: &mut Client,
|
||||
broker_prefix: &str,
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
id: TenantTimelineId,
|
||||
) -> BrokerSubscription<SkTimelineInfo> {
|
||||
) -> Streaming<SafekeeperTimelineInfo> {
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
exponential_backoff(
|
||||
@@ -322,18 +268,21 @@ async fn subscribe_for_timeline_updates(
|
||||
.await;
|
||||
attempt += 1;
|
||||
|
||||
match etcd_broker::subscribe_for_json_values(
|
||||
etcd_client,
|
||||
SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id),
|
||||
)
|
||||
.instrument(info_span!("etcd_subscription"))
|
||||
.await
|
||||
{
|
||||
Ok(new_subscription) => {
|
||||
return new_subscription;
|
||||
// subscribe to the specific timeline
|
||||
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
});
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
subscription_key: Some(key),
|
||||
};
|
||||
|
||||
match broker_client.subscribe_safekeeper_info(request).await {
|
||||
Ok(resp) => {
|
||||
return resp.into_inner();
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}");
|
||||
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -360,8 +309,8 @@ struct WalreceiverState {
|
||||
wal_connection: Option<WalConnection>,
|
||||
/// Info about retries and unsuccessful attempts to connect to safekeepers.
|
||||
wal_connection_retries: HashMap<NodeId, RetryInfo>,
|
||||
/// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id.
|
||||
wal_stream_candidates: HashMap<NodeId, EtcdSkTimeline>,
|
||||
/// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
|
||||
wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
|
||||
auth_token: Option<Arc<String>>,
|
||||
}
|
||||
|
||||
@@ -395,13 +344,11 @@ struct RetryInfo {
|
||||
retry_duration_seconds: f64,
|
||||
}
|
||||
|
||||
/// Data about the timeline to connect to, received from etcd.
|
||||
/// Data about the timeline to connect to, received from the broker.
|
||||
#[derive(Debug)]
|
||||
struct EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo,
|
||||
/// Etcd generation, the bigger it is, the more up to date the timeline data is.
|
||||
etcd_version: i64,
|
||||
/// Time at which the data was fetched from etcd last time, to track the stale data.
|
||||
struct BrokerSkTimeline {
|
||||
timeline: SafekeeperTimelineInfo,
|
||||
/// Time at which the data was fetched from the broker last time, to track the stale data.
|
||||
latest_update: NaiveDateTime,
|
||||
}
|
||||
|
||||
@@ -538,31 +485,18 @@ impl WalreceiverState {
|
||||
next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok())
|
||||
}
|
||||
|
||||
/// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key.
|
||||
fn register_timeline_update(&mut self, timeline_update: BrokerUpdate<SkTimelineInfo>) {
|
||||
match self
|
||||
.wal_stream_candidates
|
||||
.entry(timeline_update.key.node_id)
|
||||
{
|
||||
hash_map::Entry::Occupied(mut o) => {
|
||||
let existing_value = o.get_mut();
|
||||
if existing_value.etcd_version < timeline_update.etcd_version {
|
||||
existing_value.etcd_version = timeline_update.etcd_version;
|
||||
existing_value.timeline = timeline_update.value;
|
||||
existing_value.latest_update = Utc::now().naive_utc();
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(EtcdSkTimeline {
|
||||
timeline: timeline_update.value,
|
||||
etcd_version: timeline_update.etcd_version,
|
||||
latest_update: Utc::now().naive_utc(),
|
||||
});
|
||||
}
|
||||
}
|
||||
/// Records the latest broker update for a safekeeper, replacing any data previously stored for the same safekeeper node id.
|
||||
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
|
||||
self.wal_stream_candidates.insert(
|
||||
NodeId(timeline_update.safekeeper_id),
|
||||
BrokerSkTimeline {
|
||||
timeline: timeline_update,
|
||||
latest_update: Utc::now().naive_utc(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
/// Cleans up stale etcd records and checks the rest for the new connection candidate.
|
||||
/// Cleans up stale broker records and checks the rest for the new connection candidate.
|
||||
/// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise.
|
||||
/// The current rules for approving new candidates:
|
||||
/// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attempts
|
||||
@@ -585,7 +519,7 @@ impl WalreceiverState {
|
||||
Some(existing_wal_connection) => {
|
||||
let connected_sk_node = existing_wal_connection.sk_id;
|
||||
|
||||
let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connconf) =
|
||||
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
|
||||
self.select_connection_candidate(Some(connected_sk_node))?;
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -614,7 +548,7 @@ impl WalreceiverState {
|
||||
}
|
||||
|
||||
if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn {
|
||||
let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
|
||||
let new_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
|
||||
// Check if the new candidate has much more WAL than the current one.
|
||||
match new_commit_lsn.0.checked_sub(current_commit_lsn.0) {
|
||||
Some(new_sk_lsn_advantage) => {
|
||||
@@ -644,7 +578,7 @@ impl WalreceiverState {
|
||||
.status
|
||||
.commit_lsn
|
||||
.unwrap_or(current_lsn);
|
||||
let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
|
||||
let candidate_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
|
||||
|
||||
// Keep discovered_new_wal only if connected safekeeper has not caught up yet.
|
||||
let mut discovered_new_wal = existing_wal_connection
|
||||
@@ -727,7 +661,7 @@ impl WalreceiverState {
|
||||
None
|
||||
}
|
||||
|
||||
/// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers.
|
||||
/// Selects the best possible candidate, based on the data collected from the broker updates about the safekeepers.
|
||||
/// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
|
||||
///
|
||||
/// The candidate that is chosen:
|
||||
@@ -736,7 +670,7 @@ impl WalreceiverState {
|
||||
fn select_connection_candidate(
|
||||
&self,
|
||||
node_to_omit: Option<NodeId>,
|
||||
) -> Option<(NodeId, &SkTimelineInfo, PgConnectionConfig)> {
|
||||
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
|
||||
self.applicable_connection_candidates()
|
||||
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
|
||||
.max_by_key(|(_, info, _)| info.commit_lsn)
|
||||
@@ -746,12 +680,12 @@ impl WalreceiverState {
|
||||
/// Some safekeepers are filtered by the retry cooldown.
|
||||
fn applicable_connection_candidates(
|
||||
&self,
|
||||
) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, PgConnectionConfig)> {
|
||||
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
self.wal_stream_candidates
|
||||
.iter()
|
||||
.filter(|(_, info)| info.timeline.commit_lsn.is_some())
|
||||
.filter(|(_, info)| Lsn(info.timeline.commit_lsn) != Lsn::INVALID)
|
||||
.filter(move |(sk_id, _)| {
|
||||
let next_retry_at = self
|
||||
.wal_connection_retries
|
||||
@@ -761,12 +695,14 @@ impl WalreceiverState {
|
||||
});
|
||||
|
||||
next_retry_at.is_none() || next_retry_at.unwrap() <= now
|
||||
})
|
||||
.filter_map(|(sk_id, etcd_info)| {
|
||||
let info = &etcd_info.timeline;
|
||||
}).filter_map(|(sk_id, broker_info)| {
|
||||
let info = &broker_info.timeline;
|
||||
if info.safekeeper_connstr.is_empty() {
|
||||
return None; // no connection string, ignore sk
|
||||
}
|
||||
match wal_stream_connection_config(
|
||||
self.id,
|
||||
info.safekeeper_connstr.as_deref()?,
|
||||
info.safekeeper_connstr.as_ref(),
|
||||
match &self.auth_token {
|
||||
None => None,
|
||||
Some(x) => Some(x),
|
||||
@@ -781,15 +717,15 @@ impl WalreceiverState {
|
||||
})
|
||||
}
|
||||
|
||||
/// Remove candidates which haven't sent etcd updates for a while.
|
||||
/// Remove candidates which haven't sent broker updates for a while.
|
||||
fn cleanup_old_candidates(&mut self) {
|
||||
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
|
||||
|
||||
self.wal_stream_candidates.retain(|node_id, etcd_info| {
|
||||
if let Ok(time_since_latest_etcd_update) =
|
||||
(Utc::now().naive_utc() - etcd_info.latest_update).to_std()
|
||||
self.wal_stream_candidates.retain(|node_id, broker_info| {
|
||||
if let Ok(time_since_latest_broker_update) =
|
||||
(Utc::now().naive_utc() - broker_info.latest_update).to_std()
|
||||
{
|
||||
let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
|
||||
let should_retain = time_since_latest_broker_update < self.lagging_wal_timeout;
|
||||
if !should_retain {
|
||||
node_ids_to_remove.push(*node_id);
|
||||
}
|
||||
@@ -870,6 +806,28 @@ mod tests {
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use url::Host;
|
||||
|
||||
fn dummy_broker_sk_timeline(
|
||||
commit_lsn: u64,
|
||||
safekeeper_connstr: &str,
|
||||
latest_update: NaiveDateTime,
|
||||
) -> BrokerSkTimeline {
|
||||
BrokerSkTimeline {
|
||||
timeline: SafekeeperTimelineInfo {
|
||||
safekeeper_id: 0,
|
||||
tenant_timeline_id: None,
|
||||
last_log_term: 0,
|
||||
flush_lsn: 0,
|
||||
commit_lsn,
|
||||
backup_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
peer_horizon_lsn: 0,
|
||||
local_start_lsn: 0,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
},
|
||||
latest_update,
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_no_candidate")?;
|
||||
@@ -881,74 +839,16 @@ mod tests {
|
||||
|
||||
state.wal_connection = None;
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(1)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: None,
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
(
|
||||
NodeId(2),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: None,
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
(NodeId(0), dummy_broker_sk_timeline(1, "", now)),
|
||||
(NodeId(1), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
|
||||
(NodeId(2), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
|
||||
(
|
||||
NodeId(3),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: delay_over_threshold,
|
||||
},
|
||||
dummy_broker_sk_timeline(
|
||||
1 + state.max_lsn_wal_lag.get(),
|
||||
"delay_over_threshold",
|
||||
delay_over_threshold,
|
||||
),
|
||||
),
|
||||
]);
|
||||
|
||||
@@ -995,57 +895,23 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
connected_sk_id,
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(
|
||||
current_lsn + state.max_lsn_wal_lag.get() * 2,
|
||||
DUMMY_SAFEKEEPER_HOST,
|
||||
now,
|
||||
),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(current_lsn)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn, "not_advanced_lsn", now),
|
||||
),
|
||||
(
|
||||
NodeId(2),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(
|
||||
current_lsn + state.max_lsn_wal_lag.get() / 2,
|
||||
"not_enough_advanced_lsn",
|
||||
now,
|
||||
),
|
||||
),
|
||||
]);
|
||||
|
||||
@@ -1067,21 +933,7 @@ mod tests {
|
||||
state.wal_connection = None;
|
||||
state.wal_stream_candidates = HashMap::from([(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now),
|
||||
)]);
|
||||
|
||||
let only_candidate = state
|
||||
@@ -1102,57 +954,15 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(selected_lsn - 100)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(selected_lsn - 100, "smaller_commit_lsn", now),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(selected_lsn)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(selected_lsn, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(
|
||||
NodeId(2),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(selected_lsn + 100)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(selected_lsn + 100, "", now),
|
||||
),
|
||||
]);
|
||||
let biggest_wal_candidate = state.next_connection_candidate().expect(
|
||||
@@ -1186,39 +996,11 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(bigger_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(bigger_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(current_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
]);
|
||||
state.wal_connection_retries = HashMap::from([(
|
||||
@@ -1275,39 +1057,11 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
connected_sk_id,
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(current_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(new_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(new_lsn.0, "advanced_by_lsn_safekeeper", now),
|
||||
),
|
||||
]);
|
||||
|
||||
@@ -1367,21 +1121,7 @@ mod tests {
|
||||
});
|
||||
state.wal_stream_candidates = HashMap::from([(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(current_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
)]);
|
||||
|
||||
let over_threshcurrent_candidate = state.next_connection_candidate().expect(
|
||||
@@ -1441,21 +1181,7 @@ mod tests {
|
||||
});
|
||||
state.wal_stream_candidates = HashMap::from([(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(new_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(new_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
)]);
|
||||
|
||||
let over_threshcurrent_candidate = state.next_connection_candidate().expect(
|
||||
|
||||
@@ -675,33 +675,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
request.req.lsn = lsn;
|
||||
prefetch_lsn = Max(prefetch_lsn, lsn);
|
||||
slot->effective_request_lsn = prefetch_lsn;
|
||||
|
||||
/*
|
||||
* Remember request LSN in the last-written LSN cache to avoid false
|
||||
* prefetch invalidations.
|
||||
*
|
||||
* Imagine what would happen without this, when you perform a large
|
||||
* sequential scan with UPDATE. The sequential scan issues a prefetch
|
||||
* request for each page in order, and every page is also dirtied. On
|
||||
* each page, the oldest page in the last-written LSN cache is evicted,
|
||||
* which advances the global last-written LSN. The pages being scanned are
|
||||
* not in the last-written cache, so each prefetch request will use the
|
||||
* global last-written LSN in the request and memorize that in the
|
||||
* slot. However, when we receive the response to the prefetch request,
|
||||
* the global last-written LSN has already moved forwards, and the
|
||||
* cross-check we make that the last-written LSN matches will fail, and we
|
||||
* discard the prefetched response unnecessarily.
|
||||
*
|
||||
* Inserting the LSN we use in the prefetch request to the last-written LSN
|
||||
* cache avoids that problem. With that, we will use the cached value in
|
||||
* the cross-check, instead of the more recent global last-written LSN value.
|
||||
*/
|
||||
SetLastWrittenLSNForBlock(
|
||||
request.req.lsn,
|
||||
slot->buftag.rnode,
|
||||
slot->buftag.forkNum,
|
||||
slot->buftag.blockNum
|
||||
);
|
||||
}
|
||||
|
||||
Assert(slot->response == NULL);
|
||||
|
||||
@@ -49,6 +49,9 @@ pub enum AuthErrorImpl {
|
||||
)]
|
||||
MissingProjectName,
|
||||
|
||||
#[error("password authentication failed for user '{0}'")]
|
||||
AuthFailed(Box<str>),
|
||||
|
||||
/// Errors produced by e.g. [`crate::stream::PqStream`].
|
||||
#[error(transparent)]
|
||||
Io(#[from] io::Error),
|
||||
@@ -62,6 +65,10 @@ impl AuthError {
|
||||
pub fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
|
||||
AuthErrorImpl::BadAuthMethod(name.into()).into()
|
||||
}
|
||||
|
||||
pub fn auth_failed(user: impl Into<Box<str>>) -> Self {
|
||||
AuthErrorImpl::AuthFailed(user.into()).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
|
||||
@@ -78,10 +85,11 @@ impl UserFacingError for AuthError {
|
||||
GetAuthInfo(e) => e.to_string_client(),
|
||||
WakeCompute(e) => e.to_string_client(),
|
||||
Sasl(e) => e.to_string_client(),
|
||||
AuthFailed(_) => self.to_string(),
|
||||
BadAuthMethod(_) => self.to_string(),
|
||||
MalformedPassword(_) => self.to_string(),
|
||||
MissingProjectName => self.to_string(),
|
||||
_ => "Internal error".to_string(),
|
||||
Io(_) => "Internal error".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,26 +5,74 @@ use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
compute,
|
||||
error::{io_error, UserFacingError},
|
||||
http, scram,
|
||||
http, sasl, scram,
|
||||
stream::PqStream,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use reqwest::StatusCode as HttpStatusCode;
|
||||
use serde::Deserialize;
|
||||
use std::future::Future;
|
||||
use thiserror::Error;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, info_span};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
/// A go-to error message which doesn't leak any detail.
|
||||
const REQUEST_FAILED: &str = "Console request failed";
|
||||
|
||||
/// Common console API error.
|
||||
#[derive(Debug, Error)]
|
||||
#[error("{}", REQUEST_FAILED)]
|
||||
pub struct TransportError(#[from] std::io::Error);
|
||||
pub enum ApiError {
|
||||
/// Error returned by the console itself.
|
||||
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
|
||||
Console {
|
||||
status: HttpStatusCode,
|
||||
text: Box<str>,
|
||||
},
|
||||
|
||||
impl UserFacingError for TransportError {}
|
||||
/// Various IO errors like broken pipe or malformed payload.
|
||||
#[error("{REQUEST_FAILED}: {0}")]
|
||||
Transport(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
impl ApiError {
|
||||
/// Returns HTTP status code if it's the reason for failure.
|
||||
fn http_status_code(&self) -> Option<HttpStatusCode> {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
Console { status, .. } => Some(*status),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for ApiError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
// To minimize risks, only select errors are forwarded to users.
|
||||
// Ask @neondatabase/control-plane for review before adding more.
|
||||
Console { status, .. } => match *status {
|
||||
HttpStatusCode::NOT_FOUND => {
|
||||
// Status 404: failed to get a project-related resource.
|
||||
format!("{REQUEST_FAILED}: endpoint cannot be found")
|
||||
}
|
||||
HttpStatusCode::NOT_ACCEPTABLE => {
|
||||
// Status 406: endpoint is disabled (we don't allow connections).
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
HttpStatusCode::LOCKED => {
|
||||
// Status 423: project might be in maintenance mode (or bad state).
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable")
|
||||
}
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
},
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
|
||||
impl From<reqwest::Error> for TransportError {
|
||||
impl From<reqwest::Error> for ApiError {
|
||||
fn from(e: reqwest::Error) -> Self {
|
||||
io_error(e).into()
|
||||
}
|
||||
@@ -37,61 +85,73 @@ pub enum GetAuthInfoError {
|
||||
BadSecret,
|
||||
|
||||
#[error(transparent)]
|
||||
Transport(TransportError),
|
||||
ApiError(ApiError),
|
||||
}
|
||||
|
||||
// This allows more useful interactions than `#[from]`.
|
||||
impl<E: Into<ApiError>> From<E> for GetAuthInfoError {
|
||||
fn from(e: E) -> Self {
|
||||
Self::ApiError(e.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for GetAuthInfoError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use GetAuthInfoError::*;
|
||||
match self {
|
||||
// We absolutely should not leak any secrets!
|
||||
BadSecret => REQUEST_FAILED.to_owned(),
|
||||
Transport(e) => e.to_string_client(),
|
||||
// However, API might return a meaningful error.
|
||||
ApiError(e) => e.to_string_client(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: Into<TransportError>> From<E> for GetAuthInfoError {
|
||||
fn from(e: E) -> Self {
|
||||
Self::Transport(e.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum WakeComputeError {
|
||||
// We shouldn't show users the address even if it's broken.
|
||||
#[error("Console responded with a malformed compute address: {0}")]
|
||||
BadComputeAddress(String),
|
||||
BadComputeAddress(Box<str>),
|
||||
|
||||
#[error(transparent)]
|
||||
Transport(TransportError),
|
||||
ApiError(ApiError),
|
||||
}
|
||||
|
||||
// This allows more useful interactions than `#[from]`.
|
||||
impl<E: Into<ApiError>> From<E> for WakeComputeError {
|
||||
fn from(e: E) -> Self {
|
||||
Self::ApiError(e.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for WakeComputeError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use WakeComputeError::*;
|
||||
match self {
|
||||
// We shouldn't show user the address even if it's broken.
|
||||
// Besides, user is unlikely to care about this detail.
|
||||
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
|
||||
Transport(e) => e.to_string_client(),
|
||||
// However, API might return a meaningful error.
|
||||
ApiError(e) => e.to_string_client(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: Into<TransportError>> From<E> for WakeComputeError {
|
||||
fn from(e: E) -> Self {
|
||||
Self::Transport(e.into())
|
||||
}
|
||||
/// Console's response which holds client's auth secret.
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct GetRoleSecret {
|
||||
role_secret: Box<str>,
|
||||
}
|
||||
|
||||
// TODO: convert into an enum with "error"
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct GetRoleSecretResponse {
|
||||
role_secret: String,
|
||||
/// Console's response which holds compute node's `host:port` pair.
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct WakeCompute {
|
||||
address: Box<str>,
|
||||
}
|
||||
|
||||
// TODO: convert into an enum with "error"
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct GetWakeComputeResponse {
|
||||
address: String,
|
||||
/// Console's error response with human-readable description.
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct ConsoleError {
|
||||
error: Box<str>,
|
||||
}
|
||||
|
||||
/// Auth secret which is managed by the cloud.
|
||||
@@ -110,6 +170,12 @@ pub(super) struct Api<'a> {
|
||||
creds: &'a ClientCredentials<'a>,
|
||||
}
|
||||
|
||||
/// Gives generic code (bounded on `AsRef<ClientCredentials>`) access to the
/// credentials stored in this API handle.
impl<'a> AsRef<ClientCredentials<'a>> for Api<'a> {
    fn as_ref(&self) -> &ClientCredentials<'a> {
        let creds: &ClientCredentials<'a> = self.creds;
        creds
    }
}
|
||||
|
||||
impl<'a> Api<'a> {
|
||||
/// Construct an API object containing the auth parameters.
|
||||
pub(super) fn new(
|
||||
@@ -126,83 +192,88 @@ impl<'a> Api<'a> {
|
||||
|
||||
/// Authenticate the existing user or throw an error.
|
||||
pub(super) async fn handle_user(
|
||||
self,
|
||||
&'a self,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
|
||||
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
|
||||
handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
|
||||
handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
impl Api<'_> {
|
||||
async fn get_auth_info(&self) -> Result<Option<AuthInfo>, GetAuthInfoError> {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
let req = self
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.query(&[("session_id", self.extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", self.extra.application_name),
|
||||
("project", Some(self.creds.project().expect("impossible"))),
|
||||
("role", Some(self.creds.user)),
|
||||
])
|
||||
.build()?;
|
||||
async {
|
||||
let request = self
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.query(&[("session_id", self.extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", self.extra.application_name),
|
||||
("project", Some(self.creds.project().expect("impossible"))),
|
||||
("role", Some(self.creds.user)),
|
||||
])
|
||||
.build()?;
|
||||
|
||||
let span = info_span!("http", id = request_id, url = req.url().as_str());
|
||||
info!(parent: &span, "request auth info");
|
||||
let msg = self
|
||||
.endpoint
|
||||
.checked_execute(req)
|
||||
.and_then(|r| r.json::<GetRoleSecretResponse>())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!(parent: &span, "{e}");
|
||||
e
|
||||
})?;
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
let body = match parse_body::<GetRoleSecret>(response).await {
|
||||
Ok(body) => body,
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
Err(e) => match e.http_status_code() {
|
||||
Some(HttpStatusCode::NOT_FOUND) => return Ok(None),
|
||||
_otherwise => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
|
||||
scram::ServerSecret::parse(&msg.role_secret)
|
||||
.map(AuthInfo::Scram)
|
||||
.ok_or(GetAuthInfoError::BadSecret)
|
||||
let secret = scram::ServerSecret::parse(&body.role_secret)
|
||||
.map(AuthInfo::Scram)
|
||||
.ok_or(GetAuthInfoError::BadSecret)?;
|
||||
|
||||
Ok(Some(secret))
|
||||
}
|
||||
.map_err(crate::error::log_error)
|
||||
.instrument(info_span!("get_auth_info", id = request_id))
|
||||
.await
|
||||
}
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
|
||||
pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
let req = self
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.query(&[("session_id", self.extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", self.extra.application_name),
|
||||
("project", Some(self.creds.project().expect("impossible"))),
|
||||
])
|
||||
.build()?;
|
||||
async {
|
||||
let request = self
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.query(&[("session_id", self.extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", self.extra.application_name),
|
||||
("project", Some(self.creds.project().expect("impossible"))),
|
||||
])
|
||||
.build()?;
|
||||
|
||||
let span = info_span!("http", id = request_id, url = req.url().as_str());
|
||||
info!(parent: &span, "request wake-up");
|
||||
let msg = self
|
||||
.endpoint
|
||||
.checked_execute(req)
|
||||
.and_then(|r| r.json::<GetWakeComputeResponse>())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!(parent: &span, "{e}");
|
||||
e
|
||||
})?;
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
let body = parse_body::<WakeCompute>(response).await?;
|
||||
|
||||
// Unfortunately, ownership won't let us use `Option::ok_or` here.
|
||||
let (host, port) = match parse_host_port(&msg.address) {
|
||||
None => return Err(WakeComputeError::BadComputeAddress(msg.address)),
|
||||
Some(x) => x,
|
||||
};
|
||||
// Unfortunately, ownership won't let us use `Option::ok_or` here.
|
||||
let (host, port) = match parse_host_port(&body.address) {
|
||||
None => return Err(WakeComputeError::BadComputeAddress(body.address)),
|
||||
Some(x) => x,
|
||||
};
|
||||
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(host)
|
||||
.port(port)
|
||||
.dbname(self.creds.dbname)
|
||||
.user(self.creds.user);
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(host)
|
||||
.port(port)
|
||||
.dbname(self.creds.dbname)
|
||||
.user(self.creds.user);
|
||||
|
||||
Ok(config)
|
||||
Ok(config)
|
||||
}
|
||||
.map_err(crate::error::log_error)
|
||||
.instrument(info_span!("wake_compute", id = request_id))
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -215,24 +286,40 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
|
||||
wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
|
||||
) -> auth::Result<AuthSuccess<compute::ConnCfg>>
|
||||
where
|
||||
GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
|
||||
Endpoint: AsRef<ClientCredentials<'a>>,
|
||||
GetAuthInfo: Future<Output = Result<Option<AuthInfo>, GetAuthInfoError>>,
|
||||
WakeCompute: Future<Output = Result<compute::ConnCfg, WakeComputeError>>,
|
||||
{
|
||||
let creds = endpoint.as_ref();
|
||||
|
||||
info!("fetching user's authentication info");
|
||||
let auth_info = get_auth_info(endpoint).await?;
|
||||
let info = get_auth_info(endpoint).await?.unwrap_or_else(|| {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
|
||||
});
|
||||
|
||||
let flow = AuthFlow::new(client);
|
||||
let scram_keys = match auth_info {
|
||||
let scram_keys = match info {
|
||||
AuthInfo::Md5(_) => {
|
||||
// TODO: decide if we should support MD5 in api v2
|
||||
info!("auth endpoint chooses MD5");
|
||||
return Err(auth::AuthError::bad_auth_method("MD5"));
|
||||
}
|
||||
AuthInfo::Scram(secret) => {
|
||||
info!("auth endpoint chooses SCRAM");
|
||||
let scram = auth::Scram(&secret);
|
||||
let client_key = match flow.begin(scram).await?.authenticate().await? {
|
||||
sasl::Outcome::Success(key) => key,
|
||||
sasl::Outcome::Failure(reason) => {
|
||||
info!("auth backend failed with an error: {reason}");
|
||||
return Err(auth::AuthError::auth_failed(creds.user));
|
||||
}
|
||||
};
|
||||
|
||||
Some(compute::ScramKeys {
|
||||
client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(),
|
||||
client_key: client_key.as_bytes(),
|
||||
server_key: secret.server_key.as_bytes(),
|
||||
})
|
||||
}
|
||||
@@ -249,6 +336,31 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse http response body, taking status code into account.
|
||||
async fn parse_body<T: for<'a> Deserialize<'a>>(
|
||||
response: reqwest::Response,
|
||||
) -> Result<T, ApiError> {
|
||||
let status = response.status();
|
||||
if status.is_success() {
|
||||
// We shouldn't log raw body because it may contain secrets.
|
||||
info!("request succeeded, processing the body");
|
||||
return Ok(response.json().await?);
|
||||
}
|
||||
|
||||
// Don't throw an error here because it's not as important
|
||||
// as the fact that the request itself has failed.
|
||||
let body = response.json().await.unwrap_or_else(|e| {
|
||||
warn!("failed to parse error body: {e}");
|
||||
ConsoleError {
|
||||
error: "reason unclear (malformed error message)".into(),
|
||||
}
|
||||
});
|
||||
|
||||
let text = body.error;
|
||||
error!("console responded with an error ({status}): {text}");
|
||||
Err(ApiError::Console { status, text })
|
||||
}
|
||||
|
||||
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
|
||||
let (host, port) = input.split_once(':')?;
|
||||
Some((host, port.parse().ok()?))
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Local mock of Cloud API V2.
|
||||
|
||||
use super::{
|
||||
console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
|
||||
console::{self, AuthInfo, GetAuthInfoError, WakeComputeError},
|
||||
AuthSuccess,
|
||||
};
|
||||
use crate::{
|
||||
@@ -12,7 +12,28 @@ use crate::{
|
||||
stream::PqStream,
|
||||
url::ApiUrl,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use thiserror::Error;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, info_span, warn, Instrument};
|
||||
|
||||
/// Errors specific to the local mock of the console API.
#[derive(Debug, Error)]
|
||||
enum MockApiError {
|
||||
/// Reading the `rolpassword` column of the fetched row failed;
/// wraps the underlying `tokio_postgres` error.
#[error("Failed to read password: {0}")]
|
||||
PasswordNotSet(tokio_postgres::Error),
|
||||
}
|
||||
|
||||
impl From<MockApiError> for console::ApiError {
|
||||
fn from(e: MockApiError) -> Self {
|
||||
io_error(e).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<tokio_postgres::Error> for console::ApiError {
|
||||
fn from(e: tokio_postgres::Error) -> Self {
|
||||
io_error(e).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub(super) struct Api<'a> {
|
||||
@@ -20,10 +41,9 @@ pub(super) struct Api<'a> {
|
||||
creds: &'a ClientCredentials<'a>,
|
||||
}
|
||||
|
||||
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
|
||||
impl From<tokio_postgres::Error> for TransportError {
|
||||
fn from(e: tokio_postgres::Error) -> Self {
|
||||
io_error(e).into()
|
||||
impl<'a> AsRef<ClientCredentials<'a>> for Api<'a> {
|
||||
fn as_ref(&self) -> &ClientCredentials<'a> {
|
||||
self.creds
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,54 +55,55 @@ impl<'a> Api<'a> {
|
||||
|
||||
/// Authenticate the existing user or throw an error.
|
||||
pub(super) async fn handle_user(
|
||||
self,
|
||||
&'a self,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
|
||||
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
|
||||
// We reuse user handling logic from a production module.
|
||||
console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
|
||||
console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
|
||||
}
|
||||
}
|
||||
|
||||
impl Api<'_> {
|
||||
/// This implementation fetches the auth info from a local postgres instance.
|
||||
async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
// Perhaps we could persist this connection, but then we'd have to
|
||||
// write more code for reopening it if it got closed, which doesn't
|
||||
// seem worth it.
|
||||
let (client, connection) =
|
||||
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
|
||||
async fn get_auth_info(&self) -> Result<Option<AuthInfo>, GetAuthInfoError> {
|
||||
async {
|
||||
// Perhaps we could persist this connection, but then we'd have to
|
||||
// write more code for reopening it if it got closed, which doesn't
|
||||
// seem worth it.
|
||||
let (client, connection) =
|
||||
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
|
||||
|
||||
tokio::spawn(connection);
|
||||
let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1";
|
||||
let rows = client.query(query, &[&self.creds.user]).await?;
|
||||
tokio::spawn(connection);
|
||||
let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1";
|
||||
let rows = client.query(query, &[&self.creds.user]).await?;
|
||||
|
||||
match &rows[..] {
|
||||
// We can't get a secret if there's no such user.
|
||||
[] => Err(io_error(format!("unknown user '{}'", self.creds.user)).into()),
|
||||
// We can get at most one row, because `rolname` is unique.
|
||||
let row = match rows.get(0) {
|
||||
Some(row) => row,
|
||||
// This means that the user doesn't exist, so there can be no secret.
|
||||
// However, this is still a *valid* outcome which is very similar
|
||||
// to getting `404 Not found` from the Neon console.
|
||||
None => {
|
||||
warn!("user '{}' does not exist", self.creds.user);
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
// We shouldn't get more than one row anyway.
|
||||
[row, ..] => {
|
||||
let entry = row
|
||||
.try_get("rolpassword")
|
||||
.map_err(|e| io_error(format!("failed to read user's password: {e}")))?;
|
||||
let entry = row
|
||||
.try_get("rolpassword")
|
||||
.map_err(MockApiError::PasswordNotSet)?;
|
||||
|
||||
scram::ServerSecret::parse(entry)
|
||||
.map(AuthInfo::Scram)
|
||||
.or_else(|| {
|
||||
// It could be an md5 hash if it's not a SCRAM secret.
|
||||
let text = entry.strip_prefix("md5")?;
|
||||
Some(AuthInfo::Md5({
|
||||
let mut bytes = [0u8; 16];
|
||||
hex::decode_to_slice(text, &mut bytes).ok()?;
|
||||
bytes
|
||||
}))
|
||||
})
|
||||
// Putting the secret into this message is a security hazard!
|
||||
.ok_or(GetAuthInfoError::BadSecret)
|
||||
}
|
||||
info!("got a secret: {entry}"); // safe since it's not a prod scenario
|
||||
let secret = scram::ServerSecret::parse(entry).map(AuthInfo::Scram);
|
||||
Ok(secret.or_else(|| parse_md5(entry).map(AuthInfo::Md5)))
|
||||
}
|
||||
.map_err(crate::error::log_error)
|
||||
.instrument(info_span!("get_auth_info", mock = self.endpoint.as_str()))
|
||||
.await
|
||||
}
|
||||
|
||||
/// We don't need to wake anything locally, so we just return the connection info.
|
||||
pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
|
||||
pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(self.endpoint.host_str().unwrap_or("localhost"))
|
||||
@@ -93,3 +114,12 @@ impl<'a> Api<'a> {
|
||||
Ok(config)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_md5(input: &str) -> Option<[u8; 16]> {
|
||||
let text = input.strip_prefix("md5")?;
|
||||
|
||||
let mut bytes = [0u8; 16];
|
||||
hex::decode_to_slice(text, &mut bytes).ok()?;
|
||||
|
||||
Some(bytes)
|
||||
}
|
||||
|
||||
@@ -89,7 +89,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
/// Stream wrapper for handling [SCRAM](crate::scram) auth.
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<scram::ScramKey> {
|
||||
pub async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
|
||||
// Initial client message contains the chosen auth method's name.
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let sasl = sasl::FirstMessage::parse(&msg)
|
||||
@@ -101,10 +101,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
}
|
||||
|
||||
let secret = self.state.0;
|
||||
let key = sasl::SaslStream::new(self.stream, sasl.message)
|
||||
let outcome = sasl::SaslStream::new(self.stream, sasl.message)
|
||||
.authenticate(scram::Exchange::new(secret, rand::random, None))
|
||||
.await?;
|
||||
|
||||
Ok(key)
|
||||
Ok(outcome)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,15 @@
|
||||
use std::io;
|
||||
use std::{error::Error as StdError, fmt, io};
|
||||
|
||||
/// Upcast (almost) any error into an opaque [`io::Error`].
|
||||
pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
/// A small combinator for pluggable error logging.
|
||||
pub fn log_error<E: fmt::Display>(e: E) -> E {
|
||||
tracing::error!("{e}");
|
||||
e
|
||||
}
|
||||
|
||||
/// Marks errors that may be safely shown to a client.
|
||||
/// This trait can be seen as a specialized version of [`ToString`].
|
||||
@@ -6,7 +17,7 @@ use std::io;
|
||||
/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it
|
||||
/// is way too convenient and tends to proliferate all across the codebase,
|
||||
/// ultimately leading to accidental leaks of sensitive data.
|
||||
pub trait UserFacingError: ToString {
|
||||
pub trait UserFacingError: fmt::Display {
|
||||
/// Format the error for client, stripping all sensitive info.
|
||||
///
|
||||
/// Although this might be a no-op for many types, it's highly
|
||||
@@ -17,8 +28,3 @@ pub trait UserFacingError: ToString {
|
||||
self.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Upcast (almost) any error into an opaque [`io::Error`].
|
||||
pub fn io_error(e: impl Into<Box<dyn std::error::Error + Send + Sync>>) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
@@ -37,16 +37,6 @@ impl Endpoint {
|
||||
) -> Result<reqwest::Response, reqwest::Error> {
|
||||
self.client.execute(request).await
|
||||
}
|
||||
|
||||
/// Execute a [request](reqwest::Request) and raise an error if status != 200.
|
||||
pub async fn checked_execute(
|
||||
&self,
|
||||
request: reqwest::Request,
|
||||
) -> Result<reqwest::Response, reqwest::Error> {
|
||||
self.execute(request)
|
||||
.await
|
||||
.and_then(|r| r.error_for_status())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -49,17 +49,6 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
/// A small combinator for pluggable error logging.
|
||||
async fn log_error<R, F>(future: F) -> F::Output
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<R>>,
|
||||
{
|
||||
future.await.map_err(|err| {
|
||||
error!("{err}");
|
||||
err
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
listener: tokio::net::TcpListener,
|
||||
@@ -80,7 +69,7 @@ pub async fn task_main(
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let cancel_map = Arc::clone(&cancel_map);
|
||||
tokio::spawn(
|
||||
log_error(async move {
|
||||
async move {
|
||||
info!("spawned a task for {peer_addr}");
|
||||
|
||||
socket
|
||||
@@ -88,6 +77,10 @@ pub async fn task_main(
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
handle_client(config, &cancel_map, session_id, socket).await
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
})
|
||||
.instrument(info_span!("client", session = format_args!("{session_id}"))),
|
||||
);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! A group of high-level tests for connection establishing logic and auth.
|
||||
use super::*;
|
||||
use crate::{auth, scram};
|
||||
use crate::{auth, sasl, scram};
|
||||
use async_trait::async_trait;
|
||||
use rstest::rstest;
|
||||
use tokio_postgres::config::SslMode;
|
||||
@@ -100,8 +100,7 @@ impl Scram {
|
||||
}
|
||||
|
||||
fn mock(user: &str) -> Self {
|
||||
let salt = rand::random::<[u8; 32]>();
|
||||
Scram(scram::ServerSecret::mock(user, &salt))
|
||||
Scram(scram::ServerSecret::mock(user, rand::random()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,13 +110,17 @@ impl TestAuth for Scram {
|
||||
self,
|
||||
stream: &mut PqStream<Stream<S>>,
|
||||
) -> anyhow::Result<()> {
|
||||
auth::AuthFlow::new(stream)
|
||||
let outcome = auth::AuthFlow::new(stream)
|
||||
.begin(auth::Scram(&self.0))
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
use sasl::Outcome::*;
|
||||
match outcome {
|
||||
Success(_) => Ok(()),
|
||||
Failure(reason) => bail!("autentication failed with an error: {reason}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -16,22 +16,19 @@ use thiserror::Error;
|
||||
|
||||
pub use channel_binding::ChannelBinding;
|
||||
pub use messages::FirstMessage;
|
||||
pub use stream::SaslStream;
|
||||
pub use stream::{Outcome, SaslStream};
|
||||
|
||||
/// Fine-grained auth errors help in writing tests.
|
||||
#[derive(Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("Failed to authenticate client: {0}")]
|
||||
AuthenticationFailed(&'static str),
|
||||
|
||||
#[error("Channel binding failed: {0}")]
|
||||
ChannelBindingFailed(&'static str),
|
||||
|
||||
#[error("Unsupported channel binding method: {0}")]
|
||||
ChannelBindingBadMethod(Box<str>),
|
||||
|
||||
#[error("Bad client message")]
|
||||
BadClientMessage,
|
||||
#[error("Bad client message: {0}")]
|
||||
BadClientMessage(&'static str),
|
||||
|
||||
#[error(transparent)]
|
||||
Io(#[from] io::Error),
|
||||
@@ -41,8 +38,6 @@ impl UserFacingError for Error {
|
||||
fn to_string_client(&self) -> String {
|
||||
use Error::*;
|
||||
match self {
|
||||
// This constructor contains the reason why auth has failed.
|
||||
AuthenticationFailed(s) => s.to_string(),
|
||||
// TODO: add support for channel binding
|
||||
ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(),
|
||||
ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
|
||||
@@ -55,11 +50,14 @@ impl UserFacingError for Error {
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// A result of one SASL exchange.
|
||||
#[must_use]
|
||||
pub enum Step<T, R> {
|
||||
/// We should continue exchanging messages.
|
||||
Continue(T),
|
||||
Continue(T, String),
|
||||
/// The client has been authenticated successfully.
|
||||
Authenticated(R),
|
||||
Success(R, String),
|
||||
/// Authentication failed (reason attached).
|
||||
Failure(&'static str),
|
||||
}
|
||||
|
||||
/// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait.
|
||||
@@ -69,5 +67,5 @@ pub trait Mechanism: Sized {
|
||||
|
||||
/// Produce a server challenge to be sent to the client.
|
||||
/// This is how this method is called in PostgreSQL (`libpq/sasl.h`).
|
||||
fn exchange(self, input: &str) -> Result<(Step<Self, Self::Output>, String)>;
|
||||
fn exchange(self, input: &str) -> Result<Step<Self, Self::Output>>;
|
||||
}
|
||||
|
||||
@@ -48,28 +48,41 @@ impl<S: AsyncWrite + Unpin> SaslStream<'_, S> {
|
||||
}
|
||||
}
|
||||
|
||||
/// SASL authentication outcome.
|
||||
/// It's much easier to match on those two variants
|
||||
/// than to peek into a noisy protocol error type.
|
||||
#[must_use = "caller must explicitly check for success"]
|
||||
pub enum Outcome<R> {
|
||||
/// Authentication succeeded and produced some value.
|
||||
Success(R),
|
||||
/// Authentication failed (reason attached).
|
||||
Failure(&'static str),
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
|
||||
/// Perform SASL message exchange according to the underlying algorithm
|
||||
/// until user is either authenticated or denied access.
|
||||
pub async fn authenticate<M: Mechanism>(
|
||||
mut self,
|
||||
mut mechanism: M,
|
||||
) -> super::Result<M::Output> {
|
||||
) -> super::Result<Outcome<M::Output>> {
|
||||
loop {
|
||||
let input = self.recv().await?;
|
||||
let (moved, reply) = mechanism.exchange(input)?;
|
||||
let step = mechanism.exchange(input)?;
|
||||
|
||||
use super::Step::*;
|
||||
match moved {
|
||||
Continue(moved) => {
|
||||
use super::Step;
|
||||
return Ok(match step {
|
||||
Step::Continue(moved_mechanism, reply) => {
|
||||
self.send(&ServerMessage::Continue(&reply)).await?;
|
||||
mechanism = moved;
|
||||
mechanism = moved_mechanism;
|
||||
continue;
|
||||
}
|
||||
Authenticated(result) => {
|
||||
Step::Success(result, reply) => {
|
||||
self.send(&ServerMessage::Final(&reply)).await?;
|
||||
return Ok(result);
|
||||
Outcome::Success(result)
|
||||
}
|
||||
}
|
||||
Step::Failure(reason) => Outcome::Failure(reason),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,12 +64,12 @@ impl<'a> Exchange<'a> {
|
||||
impl sasl::Mechanism for Exchange<'_> {
|
||||
type Output = super::ScramKey;
|
||||
|
||||
fn exchange(mut self, input: &str) -> sasl::Result<(sasl::Step<Self, Self::Output>, String)> {
|
||||
fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
|
||||
use {sasl::Step::*, ExchangeState::*};
|
||||
match &self.state {
|
||||
Initial => {
|
||||
let client_first_message =
|
||||
ClientFirstMessage::parse(input).ok_or(SaslError::BadClientMessage)?;
|
||||
let client_first_message = ClientFirstMessage::parse(input)
|
||||
.ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
|
||||
|
||||
let server_first_message = client_first_message.build_server_first_message(
|
||||
&(self.nonce)(),
|
||||
@@ -84,15 +84,15 @@ impl sasl::Mechanism for Exchange<'_> {
|
||||
server_first_message,
|
||||
};
|
||||
|
||||
Ok((Continue(self), msg))
|
||||
Ok(Continue(self, msg))
|
||||
}
|
||||
SaltSent {
|
||||
cbind_flag,
|
||||
client_first_message_bare,
|
||||
server_first_message,
|
||||
} => {
|
||||
let client_final_message =
|
||||
ClientFinalMessage::parse(input).ok_or(SaslError::BadClientMessage)?;
|
||||
let client_final_message = ClientFinalMessage::parse(input)
|
||||
.ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
|
||||
|
||||
let channel_binding = cbind_flag.encode(|_| {
|
||||
self.cert_digest
|
||||
@@ -106,9 +106,7 @@ impl sasl::Mechanism for Exchange<'_> {
|
||||
}
|
||||
|
||||
if client_final_message.nonce != server_first_message.nonce() {
|
||||
return Err(SaslError::AuthenticationFailed(
|
||||
"combined nonce doesn't match",
|
||||
));
|
||||
return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
|
||||
}
|
||||
|
||||
let signature_builder = SignatureBuilder {
|
||||
@@ -121,14 +119,15 @@ impl sasl::Mechanism for Exchange<'_> {
|
||||
.build(&self.secret.stored_key)
|
||||
.derive_client_key(&client_final_message.proof);
|
||||
|
||||
if client_key.sha256() != self.secret.stored_key {
|
||||
return Err(SaslError::AuthenticationFailed("password doesn't match"));
|
||||
// Auth fails either if keys don't match or it's pre-determined to fail.
|
||||
if client_key.sha256() != self.secret.stored_key || self.secret.doomed {
|
||||
return Ok(Failure("password doesn't match"));
|
||||
}
|
||||
|
||||
let msg = client_final_message
|
||||
.build_server_final_message(signature_builder, &self.secret.server_key);
|
||||
|
||||
Ok((Authenticated(client_key), msg))
|
||||
Ok(Success(client_key, msg))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,9 @@ pub struct ServerSecret {
|
||||
pub stored_key: ScramKey,
|
||||
/// Used by client to verify server's signature.
|
||||
pub server_key: ScramKey,
|
||||
/// Should auth fail no matter what?
|
||||
/// This is exactly the case for mocked secrets.
|
||||
pub doomed: bool,
|
||||
}
|
||||
|
||||
impl ServerSecret {
|
||||
@@ -30,6 +33,7 @@ impl ServerSecret {
|
||||
salt_base64: salt.to_owned(),
|
||||
stored_key: base64_decode_array(stored_key)?.into(),
|
||||
server_key: base64_decode_array(server_key)?.into(),
|
||||
doomed: false,
|
||||
};
|
||||
|
||||
Some(secret)
|
||||
@@ -38,16 +42,16 @@ impl ServerSecret {
|
||||
/// To avoid revealing information to an attacker, we use a
|
||||
/// mocked server secret even if the user doesn't exist.
|
||||
/// See `auth-scram.c : mock_scram_secret` for details.
|
||||
#[allow(dead_code)]
|
||||
pub fn mock(user: &str, nonce: &[u8; 32]) -> Self {
|
||||
pub fn mock(user: &str, nonce: [u8; 32]) -> Self {
|
||||
// Refer to `auth-scram.c : scram_mock_salt`.
|
||||
let mocked_salt = super::sha256([user.as_bytes(), nonce]);
|
||||
let mocked_salt = super::sha256([user.as_bytes(), &nonce]);
|
||||
|
||||
Self {
|
||||
iterations: 4096,
|
||||
salt_base64: base64::encode(&mocked_salt),
|
||||
stored_key: ScramKey::default(),
|
||||
server_key: ScramKey::default(),
|
||||
doomed: true,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,6 +71,7 @@ impl ServerSecret {
|
||||
salt_base64: base64::encode(&salt),
|
||||
stored_key: password.client_key().sha256(),
|
||||
server_key: password.server_key(),
|
||||
doomed: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,8 +109,9 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
|
||||
|
||||
/// Write the error message using [`Self::write_message`], then re-throw it.
|
||||
/// Allowing string literals is safe under the assumption that they cannot contain any runtime info.
|
||||
/// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
|
||||
pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
|
||||
// This method exists due to `&str` not implementing `Into<anyhow::Error>`
|
||||
tracing::info!("forwarding error to user: {error}");
|
||||
self.write_message(&BeMessage::ErrorResponse(error)).await?;
|
||||
bail!(error)
|
||||
}
|
||||
@@ -122,6 +123,7 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
|
||||
E: UserFacingError + Into<anyhow::Error>,
|
||||
{
|
||||
let msg = error.to_string_client();
|
||||
tracing::info!("forwarding error to user: {msg}");
|
||||
self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
|
||||
bail!(error)
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# version, we can consider updating.
|
||||
# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package,
|
||||
# we use "unstable" version number as the highest version used in the project by default.
|
||||
channel = "1.62.1" # do update GitHub CI cache values for rust builds, when changing this value
|
||||
channel = "1.62.1"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -4,11 +4,12 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
async-stream = "0.3"
|
||||
anyhow = "1.0"
|
||||
async-trait = "0.1"
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
clap = "4.0"
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
const_format = "0.2.21"
|
||||
crc32c = "0.6.0"
|
||||
fs2 = "0.4.3"
|
||||
@@ -33,12 +34,12 @@ toml_edit = { version = "0.14", features = ["easy"] }
|
||||
tracing = "0.1.27"
|
||||
url = "2.2.2"
|
||||
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
pq_proto = { path = "../libs/pq_proto" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
safekeeper_api = { path = "../libs/safekeeper_api" }
|
||||
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
|
||||
@@ -2,18 +2,20 @@
|
||||
// Main entry point for the safekeeper executable
|
||||
//
|
||||
use anyhow::{bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, Command};
|
||||
use const_format::formatcp;
|
||||
use clap::Parser;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use toml_edit::Document;
|
||||
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use storage_broker::Uri;
|
||||
use tokio::sync::mpsc;
|
||||
use toml_edit::Document;
|
||||
|
||||
use tracing::*;
|
||||
use url::{ParseError, Url};
|
||||
use utils::pid_file;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
@@ -21,7 +23,7 @@ use safekeeper::broker;
|
||||
use safekeeper::control_file;
|
||||
use safekeeper::defaults::{
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||
DEFAULT_PG_LISTEN_ADDR,
|
||||
};
|
||||
use safekeeper::http;
|
||||
use safekeeper::remove_wal;
|
||||
@@ -29,6 +31,7 @@ use safekeeper::wal_backup;
|
||||
use safekeeper::wal_service;
|
||||
use safekeeper::GlobalTimelines;
|
||||
use safekeeper::SafeKeeperConf;
|
||||
use storage_broker::DEFAULT_ENDPOINT;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::{
|
||||
http::endpoint,
|
||||
@@ -44,128 +47,131 @@ const ID_FILE_NAME: &str = "safekeeper.id";
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = cli().get_matches();
|
||||
const ABOUT: &str = r#"
|
||||
A fleet of safekeepers is responsible for reliably storing WAL received from
|
||||
compute, passing it through consensus (mitigating potential computes brain
|
||||
split), and serving the hardened part further downstream to pageserver(s).
|
||||
"#;
|
||||
|
||||
if let Some(addr) = arg_matches.get_one::<String>("dump-control-file") {
|
||||
let state = control_file::FileStorage::load_control_file(Path::new(addr))?;
|
||||
#[derive(Parser)]
|
||||
#[command(name = "Neon safekeeper", version = GIT_VERSION, about = ABOUT, long_about = None)]
|
||||
struct Args {
|
||||
/// Path to the safekeeper data directory.
|
||||
#[arg(short = 'D', long, default_value = "./")]
|
||||
datadir: PathBuf,
|
||||
/// Safekeeper node id.
|
||||
#[arg(long)]
|
||||
id: Option<u64>,
|
||||
/// Initialize safekeeper with given id and exit.
|
||||
#[arg(long)]
|
||||
init: bool,
|
||||
/// Listen endpoint for receiving/sending WAL in the form host:port.
|
||||
#[arg(short, long, default_value = DEFAULT_PG_LISTEN_ADDR)]
|
||||
listen_pg: String,
|
||||
/// Listen http endpoint for management and metrics in the form host:port.
|
||||
#[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
|
||||
listen_http: String,
|
||||
/// Do not wait for changes to be written safely to disk. Unsafe.
|
||||
#[arg(short, long)]
|
||||
no_sync: bool,
|
||||
/// Dump control file at path specified by this argument and exit.
|
||||
#[arg(long)]
|
||||
dump_control_file: Option<PathBuf>,
|
||||
/// Broker endpoint for storage nodes coordination in the form
|
||||
/// http[s]://host:port. In case of https scheme, a TLS connection is
|
||||
/// established; plaintext otherwise.
|
||||
#[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)]
|
||||
broker_endpoint: Uri,
|
||||
/// Peer safekeeper is considered dead after not receiving heartbeats from
|
||||
/// it during this period passed as a human readable duration.
|
||||
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)]
|
||||
heartbeat_timeout: Duration,
|
||||
/// Remote storage configuration for WAL backup (offloading to s3) as TOML
|
||||
/// inline table, e.g.
|
||||
/// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "<BUCKETNAME>", "bucket_region":"<REGION>", "concurrency_limit": 119}
|
||||
/// Safekeeper offloads WAL to
|
||||
/// [prefix_in_bucket/]<tenant_id>/<timeline_id>/<segment_file>, mirroring
|
||||
/// structure on the file system.
|
||||
#[arg(long, value_parser = parse_remote_storage, verbatim_doc_comment)]
|
||||
remote_storage: Option<RemoteStorageConfig>,
|
||||
/// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
|
||||
#[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
|
||||
max_offloader_lag: u64,
|
||||
/// Number of threads for wal backup runtime, by default number of cores
|
||||
/// available to the system.
|
||||
#[arg(long)]
|
||||
wal_backup_threads: Option<usize>,
|
||||
/// Disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring
|
||||
/// WAL backup horizon.
|
||||
#[arg(long)]
|
||||
disable_wal_backup: bool,
|
||||
/// Path to an RSA .pem public key which is used to check JWT tokens.
|
||||
#[arg(long)]
|
||||
auth_validation_public_key_path: Option<PathBuf>,
|
||||
/// Format for logging, either 'plain' or 'json'.
|
||||
#[arg(long, default_value = "plain")]
|
||||
log_format: String,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
if let Some(addr) = args.dump_control_file {
|
||||
let state = control_file::FileStorage::load_control_file(addr)?;
|
||||
let json = serde_json::to_string(&state)?;
|
||||
print!("{json}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut conf = SafeKeeperConf::default();
|
||||
logging::init(LogFormat::from_config(&args.log_format)?)?;
|
||||
info!("version: {GIT_VERSION}");
|
||||
|
||||
if let Some(dir) = arg_matches.get_one::<PathBuf>("datadir") {
|
||||
// change into the data directory.
|
||||
std::env::set_current_dir(dir)?;
|
||||
// Change into the data directory.
|
||||
std::env::set_current_dir(&args.datadir)?;
|
||||
|
||||
// Set or read our ID.
|
||||
let id = set_id(&args.datadir, args.id.map(NodeId))?;
|
||||
if args.init {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if arg_matches.get_flag("no-sync") {
|
||||
conf.no_sync = true;
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.get_one::<String>("listen-pg") {
|
||||
conf.listen_pg_addr = addr.to_string();
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.get_one::<String>("listen-http") {
|
||||
conf.listen_http_addr = addr.to_string();
|
||||
}
|
||||
|
||||
let mut given_id = None;
|
||||
if let Some(given_id_str) = arg_matches.get_one::<String>("id") {
|
||||
given_id = Some(NodeId(
|
||||
given_id_str
|
||||
.parse()
|
||||
.context("failed to parse safekeeper id")?,
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.get_one::<String>("broker-endpoints") {
|
||||
let collected_ep: Result<Vec<Url>, ParseError> = addr.split(',').map(Url::parse).collect();
|
||||
conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?;
|
||||
}
|
||||
if let Some(prefix) = arg_matches.get_one::<String>("broker-etcd-prefix") {
|
||||
conf.broker_etcd_prefix = prefix.to_string();
|
||||
}
|
||||
|
||||
if let Some(heartbeat_timeout_str) = arg_matches.get_one::<String>("heartbeat-timeout") {
|
||||
conf.heartbeat_timeout =
|
||||
humantime::parse_duration(heartbeat_timeout_str).with_context(|| {
|
||||
format!(
|
||||
"failed to parse heartbeat-timeout {}",
|
||||
heartbeat_timeout_str
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
if let Some(backup_threads) = arg_matches.get_one::<String>("wal-backup-threads") {
|
||||
conf.backup_runtime_threads = backup_threads
|
||||
.parse()
|
||||
.with_context(|| format!("Failed to parse backup threads {}", backup_threads))?;
|
||||
}
|
||||
if let Some(storage_conf) = arg_matches.get_one::<String>("remote-storage") {
|
||||
// funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse
|
||||
let storage_conf_toml = format!("remote_storage = {}", storage_conf);
|
||||
let parsed_toml = storage_conf_toml.parse::<Document>()?; // parse
|
||||
let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
|
||||
conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?);
|
||||
}
|
||||
if let Some(max_offloader_lag_str) = arg_matches.get_one::<String>("max-offloader-lag") {
|
||||
conf.max_offloader_lag_bytes = max_offloader_lag_str.parse().with_context(|| {
|
||||
format!(
|
||||
"failed to parse max offloader lag {}",
|
||||
max_offloader_lag_str
|
||||
)
|
||||
})?;
|
||||
}
|
||||
// Seems like there is no better way to accept bool values explicitly in clap.
|
||||
conf.wal_backup_enabled = arg_matches
|
||||
.get_one::<String>("enable-wal-backup")
|
||||
.unwrap()
|
||||
.parse()
|
||||
.context("failed to parse bool enable-s3-offload bool")?;
|
||||
|
||||
conf.auth_validation_public_key_path = arg_matches
|
||||
.get_one::<String>("auth-validation-public-key-path")
|
||||
.map(PathBuf::from);
|
||||
|
||||
if let Some(log_format) = arg_matches.get_one::<String>("log-format") {
|
||||
conf.log_format = LogFormat::from_config(log_format)?;
|
||||
}
|
||||
let conf = SafeKeeperConf {
|
||||
workdir: args.datadir,
|
||||
my_id: id,
|
||||
listen_pg_addr: args.listen_pg,
|
||||
listen_http_addr: args.listen_http,
|
||||
no_sync: args.no_sync,
|
||||
broker_endpoint: args.broker_endpoint,
|
||||
heartbeat_timeout: args.heartbeat_timeout,
|
||||
remote_storage: args.remote_storage,
|
||||
max_offloader_lag_bytes: args.max_offloader_lag,
|
||||
backup_runtime_threads: args.wal_backup_threads,
|
||||
wal_backup_enabled: !args.disable_wal_backup,
|
||||
auth_validation_public_key_path: args.auth_validation_public_key_path,
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]);
|
||||
start_safekeeper(conf, given_id, arg_matches.get_flag("init"))
|
||||
start_safekeeper(conf)
|
||||
}
|
||||
|
||||
fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bool) -> Result<()> {
|
||||
logging::init(conf.log_format)?;
|
||||
info!("version: {GIT_VERSION}");
|
||||
|
||||
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
info!("claimed pid file at {lock_file_path:?}");
|
||||
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
// Set or read our ID.
|
||||
set_id(&mut conf, given_id)?;
|
||||
if init {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
info!("Starting safekeeper on {}", conf.listen_pg_addr);
|
||||
info!("starting safekeeper on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
e
|
||||
@@ -173,11 +179,11 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
|
||||
let auth = match conf.auth_validation_public_key_path.as_ref() {
|
||||
None => {
|
||||
info!("Auth is disabled");
|
||||
info!("auth is disabled");
|
||||
None
|
||||
}
|
||||
Some(path) => {
|
||||
info!("Loading JWT auth key from {}", path.display());
|
||||
info!("loading JWT auth key from {}", path.display());
|
||||
Some(Arc::new(
|
||||
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
||||
))
|
||||
@@ -214,7 +220,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
|
||||
let conf_cloned = conf.clone();
|
||||
let safekeeper_thread = thread::Builder::new()
|
||||
.name("Safekeeper thread".into())
|
||||
.name("safekeeper thread".into())
|
||||
.spawn(|| {
|
||||
if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) {
|
||||
info!("safekeeper thread terminated: {e}");
|
||||
@@ -224,19 +230,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
|
||||
threads.push(safekeeper_thread);
|
||||
|
||||
if !conf.broker_endpoints.is_empty() {
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("broker thread".into())
|
||||
.spawn(|| {
|
||||
// TODO: add auth?
|
||||
broker::thread_main(conf_);
|
||||
})?,
|
||||
);
|
||||
} else {
|
||||
warn!("No broker endpoints providing, starting without node sync")
|
||||
}
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("broker thread".into())
|
||||
.spawn(|| {
|
||||
// TODO: add auth?
|
||||
broker::thread_main(conf_);
|
||||
})?,
|
||||
);
|
||||
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
@@ -247,12 +249,11 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
})?,
|
||||
);
|
||||
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("wal backup launcher thread".into())
|
||||
.name("WAL backup launcher thread".into())
|
||||
.spawn(move || {
|
||||
wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx);
|
||||
wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx);
|
||||
})?,
|
||||
);
|
||||
|
||||
@@ -271,12 +272,12 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
})
|
||||
}
|
||||
|
||||
/// Determine safekeeper id and set it in config.
|
||||
fn set_id(conf: &mut SafeKeeperConf, given_id: Option<NodeId>) -> Result<()> {
|
||||
let id_file_path = conf.workdir.join(ID_FILE_NAME);
|
||||
/// Determine safekeeper id.
|
||||
fn set_id(workdir: &Path, given_id: Option<NodeId>) -> Result<NodeId> {
|
||||
let id_file_path = workdir.join(ID_FILE_NAME);
|
||||
|
||||
let my_id: NodeId;
|
||||
// If ID exists, read it in; otherwise set one passed
|
||||
// If file with ID exists, read it in; otherwise set one passed.
|
||||
match fs::read(&id_file_path) {
|
||||
Ok(id_serialized) => {
|
||||
my_id = NodeId(
|
||||
@@ -306,115 +307,30 @@ fn set_id(conf: &mut SafeKeeperConf, given_id: Option<NodeId>) -> Result<()> {
|
||||
let mut f = File::create(&id_file_path)?;
|
||||
f.write_all(my_id.to_string().as_bytes())?;
|
||||
f.sync_all()?;
|
||||
info!("initialized safekeeper ID {}", my_id);
|
||||
info!("initialized safekeeper id {}", my_id);
|
||||
}
|
||||
_ => {
|
||||
return Err(error.into());
|
||||
}
|
||||
},
|
||||
}
|
||||
conf.my_id = my_id;
|
||||
Ok(())
|
||||
Ok(my_id)
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
Command::new("Neon safekeeper")
|
||||
.about("Store WAL stream to local file system and push it to WAL receivers")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("datadir")
|
||||
.short('D')
|
||||
.long("dir")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.help("Path to the safekeeper data directory"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("init")
|
||||
.long("init")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Initialize safekeeper with ID"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("listen-pg")
|
||||
.short('l')
|
||||
.long("listen-pg")
|
||||
.alias("listen") // for compatibility
|
||||
.help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("listen-http")
|
||||
.long("listen-http")
|
||||
.help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
// FIXME this argument is no longer needed since pageserver address is forwarded from compute.
|
||||
// However because this argument is in use by console's e2e tests let's keep it for now and remove separately.
|
||||
// So currently it is a noop.
|
||||
.arg(
|
||||
Arg::new("pageserver")
|
||||
.short('p')
|
||||
.long("pageserver"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("no-sync")
|
||||
.short('n')
|
||||
.long("no-sync")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Do not wait for changes to be written safely to disk"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dump-control-file")
|
||||
.long("dump-control-file")
|
||||
.help("Dump control file at path specified by this argument and exit"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("id").long("id").help("safekeeper node id: integer")
|
||||
).arg(
|
||||
Arg::new("broker-endpoints")
|
||||
.long("broker-endpoints")
|
||||
.help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("broker-etcd-prefix")
|
||||
.long("broker-etcd-prefix")
|
||||
.help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("heartbeat-timeout")
|
||||
.long("heartbeat-timeout")
|
||||
.help(formatcp!("Peer is considered dead after not receiving heartbeats from it during this period (default {}s), passed as a human readable duration.", DEFAULT_HEARTBEAT_TIMEOUT.as_secs()))
|
||||
)
|
||||
.arg(
|
||||
Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")),
|
||||
).arg(
|
||||
Arg::new("remote-storage")
|
||||
.long("remote-storage")
|
||||
.help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"<BUCKETNAME>\", \"bucket_region\":\"<REGION>\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]<tenant_id>/<timeline_id>/<segment_file>, mirroring structure on the file system.")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("max-offloader-lag")
|
||||
.long("max-offloader-lag")
|
||||
.help(formatcp!("Safekeeper won't be elected for WAL offloading if it is lagging for more than this value (default {}MB) in bytes", DEFAULT_MAX_OFFLOADER_LAG_BYTES / (1 << 20)))
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enable-wal-backup")
|
||||
.long("enable-wal-backup")
|
||||
.default_value("true")
|
||||
.default_missing_value("true")
|
||||
.help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("auth-validation-public-key-path")
|
||||
.long("auth-validation-public-key-path")
|
||||
.help("Path to an RSA .pem public key which is used to check JWT tokens")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("log-format")
|
||||
.long("log-format")
|
||||
.help("Format for logging, either 'plain' or 'json'")
|
||||
)
|
||||
// Parse RemoteStorage from TOML table.
|
||||
fn parse_remote_storage(storage_conf: &str) -> anyhow::Result<RemoteStorageConfig> {
|
||||
// funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse
|
||||
let storage_conf_toml = format!("remote_storage = {storage_conf}");
|
||||
let parsed_toml = storage_conf_toml.parse::<Document>()?; // parse
|
||||
let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
|
||||
RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| {
|
||||
// XXX: Don't print the original toml here, there might be some sensitive data
|
||||
parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config")
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert();
|
||||
use clap::CommandFactory;
|
||||
Args::command().debug_assert()
|
||||
}
|
||||
|
||||
@@ -1,15 +1,18 @@
|
||||
//! Communication with etcd, providing safekeeper peers and pageserver coordination.
|
||||
//! Communication with the broker, providing safekeeper peers and pageserver coordination.
|
||||
|
||||
use anyhow::anyhow;
|
||||
use anyhow::bail;
|
||||
use anyhow::Context;
|
||||
|
||||
use anyhow::Error;
|
||||
use anyhow::Result;
|
||||
use etcd_broker::subscription_value::SkTimelineInfo;
|
||||
use etcd_broker::LeaseKeepAliveStream;
|
||||
use etcd_broker::LeaseKeeper;
|
||||
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use storage_broker::parse_proto_ttid;
|
||||
use storage_broker::proto::broker_service_client::BrokerServiceClient;
|
||||
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::Request;
|
||||
|
||||
use std::time::Duration;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::{runtime, time::sleep};
|
||||
@@ -17,15 +20,9 @@ use tracing::*;
|
||||
|
||||
use crate::GlobalTimelines;
|
||||
use crate::SafeKeeperConf;
|
||||
use etcd_broker::{
|
||||
subscription_key::{OperationKind, SkOperationKind, SubscriptionKey},
|
||||
Client, PutOptions,
|
||||
};
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
|
||||
const RETRY_INTERVAL_MSEC: u64 = 1000;
|
||||
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
||||
const LEASE_TTL_SEC: i64 = 10;
|
||||
|
||||
pub fn thread_main(conf: SafeKeeperConf) {
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
@@ -34,158 +31,70 @@ pub fn thread_main(conf: SafeKeeperConf) {
|
||||
.unwrap();
|
||||
|
||||
let _enter = info_span!("broker").entered();
|
||||
info!("started, broker endpoints {:?}", conf.broker_endpoints);
|
||||
info!("started, broker endpoint {:?}", conf.broker_endpoint);
|
||||
|
||||
runtime.block_on(async {
|
||||
main_loop(conf).await;
|
||||
});
|
||||
}
|
||||
|
||||
/// Key to per timeline per safekeeper data.
|
||||
fn timeline_safekeeper_path(
|
||||
broker_etcd_prefix: String,
|
||||
ttid: TenantTimelineId,
|
||||
sk_id: NodeId,
|
||||
) -> String {
|
||||
format!(
|
||||
"{}/{sk_id}",
|
||||
SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key()
|
||||
)
|
||||
}
|
||||
|
||||
async fn push_sk_info(
|
||||
ttid: TenantTimelineId,
|
||||
mut client: Client,
|
||||
key: String,
|
||||
sk_info: SkTimelineInfo,
|
||||
mut lease: Lease,
|
||||
) -> anyhow::Result<(TenantTimelineId, Lease)> {
|
||||
let put_opts = PutOptions::new().with_lease(lease.id);
|
||||
client
|
||||
.put(
|
||||
key.clone(),
|
||||
serde_json::to_string(&sk_info)?,
|
||||
Some(put_opts),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("failed to push safekeeper info to {}", key))?;
|
||||
|
||||
// revive the lease
|
||||
lease
|
||||
.keeper
|
||||
.keep_alive()
|
||||
.await
|
||||
.context("failed to send LeaseKeepAliveRequest")?;
|
||||
lease
|
||||
.ka_stream
|
||||
.message()
|
||||
.await
|
||||
.context("failed to receive LeaseKeepAliveResponse")?;
|
||||
|
||||
Ok((ttid, lease))
|
||||
}
|
||||
|
||||
struct Lease {
|
||||
id: i64,
|
||||
keeper: LeaseKeeper,
|
||||
ka_stream: LeaseKeepAliveStream,
|
||||
}
|
||||
|
||||
/// Push once in a while data about all active timelines to the broker.
|
||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
||||
let mut leases: HashMap<TenantTimelineId, Lease> = HashMap::new();
|
||||
|
||||
let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
|
||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||
loop {
|
||||
// Note: we lock runtime here and in timeline methods as GlobalTimelines
|
||||
// is under plain mutex. That's ok, all this code is not performance
|
||||
// sensitive and there is no risk of deadlock as we don't await while
|
||||
// lock is held.
|
||||
let mut active_tlis = GlobalTimelines::get_all();
|
||||
active_tlis.retain(|tli| tli.is_active());
|
||||
|
||||
let active_tlis_set: HashSet<TenantTimelineId> =
|
||||
active_tlis.iter().map(|tli| tli.ttid).collect();
|
||||
|
||||
// // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data.
|
||||
for tli in &active_tlis {
|
||||
if let Entry::Vacant(v) = leases.entry(tli.ttid) {
|
||||
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
|
||||
let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?;
|
||||
v.insert(Lease {
|
||||
id: lease.id(),
|
||||
keeper,
|
||||
ka_stream,
|
||||
});
|
||||
}
|
||||
}
|
||||
leases.retain(|ttid, _| active_tlis_set.contains(ttid));
|
||||
|
||||
// Push data concurrently to not suffer from latency, with many timelines it can be slow.
|
||||
let handles = active_tlis
|
||||
.iter()
|
||||
.map(|tli| {
|
||||
let outbound = async_stream::stream! {
|
||||
loop {
|
||||
// Note: we lock runtime here and in timeline methods as GlobalTimelines
|
||||
// is under plain mutex. That's ok, all this code is not performance
|
||||
// sensitive and there is no risk of deadlock as we don't await while
|
||||
// lock is held.
|
||||
let mut active_tlis = GlobalTimelines::get_all();
|
||||
active_tlis.retain(|tli| tli.is_active());
|
||||
for tli in &active_tlis {
|
||||
let sk_info = tli.get_safekeeper_info(&conf);
|
||||
let key =
|
||||
timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id);
|
||||
let lease = leases.remove(&tli.ttid).unwrap();
|
||||
tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
for h in handles {
|
||||
let (ttid, lease) = h.await??;
|
||||
// It is ugly to pull leases from hash and then put it back, but
|
||||
// otherwise we have to resort to long living per tli tasks (which
|
||||
// would generate a lot of errors when etcd is down) as task wants to
|
||||
// have 'static objects, we can't borrow to it.
|
||||
leases.insert(ttid, lease);
|
||||
yield sk_info;
|
||||
}
|
||||
sleep(push_interval).await;
|
||||
}
|
||||
|
||||
sleep(push_interval).await;
|
||||
}
|
||||
};
|
||||
client
|
||||
.publish_safekeeper_info(Request::new(outbound))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subscribe and fetch all the interesting data from the broker.
|
||||
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
|
||||
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
||||
let mut client = storage_broker::connect(conf.broker_endpoint)?;
|
||||
|
||||
let mut subscription = etcd_broker::subscribe_for_values(
|
||||
&mut client,
|
||||
SubscriptionKey::all(conf.broker_etcd_prefix.clone()),
|
||||
|full_key, value_str| {
|
||||
if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) {
|
||||
match serde_json::from_str::<SkTimelineInfo>(value_str) {
|
||||
Ok(new_info) => return Some(new_info),
|
||||
Err(e) => {
|
||||
error!("Failed to parse timeline info from value str '{value_str}': {e}")
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("failed to subscribe for safekeeper info")?;
|
||||
loop {
|
||||
match subscription.value_updates.recv().await {
|
||||
Some(new_info) => {
|
||||
// note: there are blocking operations below, but it's considered fine for now
|
||||
if let Ok(tli) = GlobalTimelines::get(new_info.key.id) {
|
||||
// Note that we also receive *our own* info. That's
|
||||
// important, as it is used as an indication of live
|
||||
// connection to the broker.
|
||||
tli.record_safekeeper_info(&new_info.value, new_info.key.node_id)
|
||||
.await?
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// XXX it means we lost connection with etcd, error is consumed inside sub object
|
||||
debug!("timeline updates sender closed, aborting the pull loop");
|
||||
return Ok(());
|
||||
}
|
||||
// TODO: subscribe only to local timelines instead of all
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
subscription_key: Some(ProtoSubscriptionKey::All(())),
|
||||
};
|
||||
|
||||
let mut stream = client
|
||||
.subscribe_safekeeper_info(request)
|
||||
.await
|
||||
.context("subscribe_safekeper_info request failed")?
|
||||
.into_inner();
|
||||
|
||||
while let Some(msg) = stream.message().await? {
|
||||
let proto_ttid = msg
|
||||
.tenant_timeline_id
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
|
||||
let ttid = parse_proto_ttid(proto_ttid)?;
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
// Note that we also receive *our own* info. That's
|
||||
// important, as it is used as an indication of live
|
||||
// connection to the broker.
|
||||
|
||||
// note: there are blocking operations below, but it's considered fine for now
|
||||
tli.record_safekeeper_info(&msg).await?
|
||||
}
|
||||
}
|
||||
bail!("end of stream");
|
||||
}
|
||||
|
||||
async fn main_loop(conf: SafeKeeperConf) {
|
||||
|
||||
@@ -231,7 +231,7 @@ mod test {
|
||||
let workdir = tempfile::tempdir().unwrap().into_path();
|
||||
SafeKeeperConf {
|
||||
workdir,
|
||||
..Default::default()
|
||||
..SafeKeeperConf::dummy()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user