mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 10:22:56 +00:00
Compare commits
81 Commits
layer_map_
...
alexk/get_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1e3a6cc813 | ||
|
|
eb403da814 | ||
|
|
f3ad635911 | ||
|
|
a8d7360881 | ||
|
|
b0311cfdeb | ||
|
|
412e0aa985 | ||
|
|
965b4f4ae2 | ||
|
|
95018672fa | ||
|
|
2caece2077 | ||
|
|
b8b8c19fb4 | ||
|
|
225add041f | ||
|
|
5d001b1e5a | ||
|
|
fe462de85b | ||
|
|
c0de7f5cd8 | ||
|
|
b220ba6cd1 | ||
|
|
7de373210d | ||
|
|
5c5b03ce08 | ||
|
|
d7d3f451f0 | ||
|
|
bc7d3c6476 | ||
|
|
e3d75879c0 | ||
|
|
485b269674 | ||
|
|
ee1eda9921 | ||
|
|
e363911c85 | ||
|
|
d5d690c044 | ||
|
|
8f557477c6 | ||
|
|
af210c8b42 | ||
|
|
2153d2e00a | ||
|
|
564fa11244 | ||
|
|
8d28a24b26 | ||
|
|
53128d56d9 | ||
|
|
40799d8ae7 | ||
|
|
b242b0ad67 | ||
|
|
d90cd36bcc | ||
|
|
956b6f17ca | ||
|
|
6f9af0aa8c | ||
|
|
8e6b27bf7c | ||
|
|
ae3eff1ad2 | ||
|
|
501702b27c | ||
|
|
526f8b76aa | ||
|
|
a1b062123b | ||
|
|
a4d5c8085b | ||
|
|
edffe0dd9d | ||
|
|
d9c518b2cc | ||
|
|
0d3aefb274 | ||
|
|
6139e8e426 | ||
|
|
d9ba3c5f5e | ||
|
|
0cf7fd0fb8 | ||
|
|
f0b41e7750 | ||
|
|
5082d84f5b | ||
|
|
7991bd3b69 | ||
|
|
ddbdcdddd7 | ||
|
|
7b182e2605 | ||
|
|
1d9d7c02db | ||
|
|
a974602f9f | ||
|
|
a839860c2e | ||
|
|
a5ce2b5330 | ||
|
|
3569c1bacd | ||
|
|
86681b92aa | ||
|
|
eb21d9969d | ||
|
|
e6618f1cc0 | ||
|
|
eaff14da5f | ||
|
|
f383b4d540 | ||
|
|
694150ce40 | ||
|
|
f4359b688c | ||
|
|
948f047f0a | ||
|
|
4175cfbdac | ||
|
|
9657459d80 | ||
|
|
a4256b3250 | ||
|
|
175a577ad4 | ||
|
|
1fdf01e3bc | ||
|
|
446a39e969 | ||
|
|
7ed9eb4a56 | ||
|
|
f07d6433b6 | ||
|
|
2040db98ef | ||
|
|
371493ae32 | ||
|
|
1b9e5e84aa | ||
|
|
7ed93fff06 | ||
|
|
a6dffb6ef9 | ||
|
|
c5c14368e3 | ||
|
|
1254dc7ee2 | ||
|
|
fcb905f519 |
@@ -21,3 +21,4 @@
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
|
||||
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-ap-southeast-1
|
||||
bucket_region: ap-southeast-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
@@ -32,7 +32,7 @@ storage:
|
||||
hosts:
|
||||
safekeeper-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0d6f1dc5161eef894
|
||||
safekeeper-1.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0e338adda8eb2d19f
|
||||
safekeeper-2.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-04fb63634e4679eb9
|
||||
safekeeper-3.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-05481f3bc88cfc2d4
|
||||
|
||||
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
|
||||
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
|
||||
6
.github/ansible/prod.us-west-2.hosts.yaml
vendored
6
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-west-2
|
||||
bucket_region: us-west-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
@@ -29,6 +29,8 @@ storage:
|
||||
ansible_host: i-0c834be1dddba8b3f
|
||||
pageserver-2.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-051642d372c0a4f32
|
||||
pageserver-3.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-00c3844beb9ad1c6b
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
40
.github/ansible/production.hosts.yaml
vendored
40
.github/ansible/production.hosts.yaml
vendored
@@ -1,40 +0,0 @@
|
||||
---
|
||||
storage:
|
||||
vars:
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
bucket_name: zenith-storage-oregon
|
||||
bucket_region: us-west-2
|
||||
broker_endpoint: http://storage-broker.prod.local:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
safekeeper_s3_prefix: prod-1/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
zenith-1-ps-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-3:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-4:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-5:
|
||||
console_region_id: aws-us-west-2
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
zenith-1-sk-1:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-4:
|
||||
console_region_id: aws-us-west-2
|
||||
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -2,12 +2,17 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-dev-storage-eu-west-1
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
13
.github/ansible/staging.us-east-2.hosts.yaml
vendored
13
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -2,12 +2,17 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-staging-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
@@ -31,6 +36,8 @@ storage:
|
||||
ansible_host: i-01e31cdf7e970586a
|
||||
pageserver-3.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0602a0291365ef7cc
|
||||
pageserver-99.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0c39491109bb88824
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
@@ -40,3 +47,5 @@ storage:
|
||||
ansible_host: i-0171efc3604a7b907
|
||||
safekeeper-2.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0de0b03a51676a6ce
|
||||
safekeeper-99.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0d61b6a2ea32028d5
|
||||
|
||||
@@ -1,16 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.eu-west-1.aws.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -10,7 +10,7 @@ settings:
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
domain: "pg.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy-link pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.cloud.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
# Use L4 LB
|
||||
service:
|
||||
# service.annotations -- Annotations to add to the service
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||
# assign service to this name at external-dns
|
||||
external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
|
||||
# service.type -- Service type
|
||||
type: LoadBalancer
|
||||
# service.port -- broker listen port
|
||||
port: 50051
|
||||
|
||||
ingress:
|
||||
enabled: false
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -611,34 +611,31 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_INFORMANT_VERSION: 0.1.1
|
||||
VM_BUILDER_VERSION: v0.4.6
|
||||
|
||||
steps:
|
||||
- name: Downloading latest vm-builder
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Downloading vm-builder
|
||||
run: |
|
||||
curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
|
||||
curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
- name: Pulling compute-node image
|
||||
run: |
|
||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
|
||||
- name: Building VM compute-node rootfs
|
||||
run: |
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
|
||||
chmod +x vm-informant
|
||||
|
||||
- name: Adding VM informant to compute-node image
|
||||
run: |
|
||||
ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
|
||||
docker cp vm-informant $ID:/bin/vm-informant
|
||||
docker commit $ID temp-vm-compute-node
|
||||
docker rm -f $ID
|
||||
docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
|
||||
./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
|
||||
2
.github/workflows/deploy-dev.yml
vendored
2
.github/workflows/deploy-dev.yml
vendored
@@ -67,7 +67,7 @@ jobs:
|
||||
./get_binaries.sh
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
- name: Cleanup ansible folder
|
||||
|
||||
81
.github/workflows/deploy-prod.yml
vendored
81
.github/workflows/deploy-prod.yml
vendored
@@ -40,7 +40,9 @@ concurrency:
|
||||
jobs:
|
||||
deploy-prod-new:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
container:
|
||||
image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
options: --user root --privileged
|
||||
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
@@ -66,7 +68,7 @@ jobs:
|
||||
./get_binaries.sh
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy-prod-new:
|
||||
@@ -163,78 +165,3 @@ jobs:
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
# Deploy to old account below
|
||||
|
||||
deploy:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
environment:
|
||||
name: prod-old
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
ref: ${{ inputs.branch }}
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
export DOCKER_TAG=${{ inputs.dockerTag }}
|
||||
cd "$(pwd)/.github/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
eval $(ssh-agent)
|
||||
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
|
||||
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
|
||||
- name: Cleanup ansible folder
|
||||
run: rm -rf ~/.ansible
|
||||
|
||||
deploy-storage-broker:
|
||||
name: deploy storage broker on old staging and old prod
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
environment:
|
||||
name: prod-old
|
||||
env:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
ref: ${{ inputs.branch }}
|
||||
|
||||
- name: Store kubeconfig file
|
||||
run: |
|
||||
echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
|
||||
- name: Add neon helm chart
|
||||
run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
- name: Cleanup helm folder
|
||||
run: rm -rf ~/.cache
|
||||
|
||||
71
Cargo.lock
generated
71
Cargo.lock
generated
@@ -917,6 +917,7 @@ dependencies = [
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"storage_broker",
|
||||
"tar",
|
||||
@@ -2109,6 +2110,16 @@ version = "0.3.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "mime_guess"
|
||||
version = "2.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
|
||||
dependencies = [
|
||||
"mime",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
@@ -2421,6 +2432,7 @@ dependencies = [
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"enum-map",
|
||||
"enumset",
|
||||
"fail",
|
||||
@@ -2484,6 +2496,7 @@ dependencies = [
|
||||
"enum-map",
|
||||
"postgres_ffi",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -2881,6 +2894,7 @@ dependencies = [
|
||||
"md5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"pq_proto",
|
||||
@@ -2889,6 +2903,8 @@ dependencies = [
|
||||
"rcgen",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"reqwest-tracing",
|
||||
"routerify",
|
||||
"rstest",
|
||||
"rustls",
|
||||
@@ -2898,6 +2914,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"socket2",
|
||||
"sync_wrapper",
|
||||
"thiserror",
|
||||
"tls-listener",
|
||||
"tokio",
|
||||
@@ -2905,7 +2922,9 @@ dependencies = [
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
@@ -3035,6 +3054,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
@@ -3075,6 +3095,7 @@ dependencies = [
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"mime_guess",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
@@ -3094,6 +3115,36 @@ dependencies = [
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-middleware"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"http",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"task-local-extensions",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.16.20"
|
||||
@@ -3786,6 +3837,15 @@ dependencies = [
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "task-local-extensions"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4"
|
||||
dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.3.0"
|
||||
@@ -3805,6 +3865,8 @@ name = "tenant_size_model"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -4354,6 +4416,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
|
||||
dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.10"
|
||||
|
||||
@@ -38,6 +38,7 @@ comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-utils = "0.8.5"
|
||||
either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
@@ -68,7 +69,6 @@ once_cell = "1.13"
|
||||
opentelemetry = "0.18.0"
|
||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.10.0"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
@@ -76,6 +76,8 @@ prost = "0.11"
|
||||
rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
routerify = "3"
|
||||
rpds = "0.12.0"
|
||||
rustls = "0.20"
|
||||
@@ -92,6 +94,7 @@ socket2 = "0.4.4"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
svg_fmt = "0.4.1"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||
@@ -104,6 +107,7 @@ toml = "0.5"
|
||||
toml_edit = { version = "0.17", features = ["easy"] }
|
||||
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
ARG PG_VERSION
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
@@ -11,7 +12,7 @@ FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev
|
||||
libicu-dev libxslt1-dev
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -23,7 +24,8 @@ FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION} postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
|
||||
--with-libxml --with-libxslt && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
@@ -34,7 +36,8 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -50,17 +53,18 @@ RUN apt update && \
|
||||
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
|
||||
protobuf-c-compiler xsltproc
|
||||
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
|
||||
tar zxvf SFCGAL-v1.3.10.tar.gz && \
|
||||
cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make clean && cp -R /sfcgal/* /
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
tar xvzf postgis-3.3.1.tar.gz && \
|
||||
cd postgis-3.3.1 && \
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
./autogen.sh && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
cd extensions/postgis && \
|
||||
@@ -74,6 +78,15 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plv8-build"
|
||||
@@ -83,30 +96,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
# https://github.com/plv8/plv8/issues/475:
|
||||
# v8 uses gold for linking and sets `--thread-count=4` which breaks
|
||||
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
|
||||
# Install newer gold version manually as debian-testing binutils version updates
|
||||
# libc version, which in turn breaks other extension built against non-testing libc.
|
||||
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
|
||||
tar xvzf binutils-2.38.tar.gz && \
|
||||
cd binutils-2.38 && \
|
||||
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
cd ../bfd && ./configure && make bfdver.h && \
|
||||
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
|
||||
cp /usr/local/bin/ld.gold /usr/bin/gold
|
||||
|
||||
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
tar xvzf v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -124,20 +124,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
|
||||
tar xvzf h3.tgz && \
|
||||
cd h3-4.0.1 && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
|
||||
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/h3 make install && \
|
||||
cp -R /h3/usr / && \
|
||||
rm -rf build
|
||||
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
|
||||
tar xvzf h3-pg.tgz && \
|
||||
cd h3-pg-4.0.1 && \
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -153,9 +150,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
||||
FROM build-deps AS unit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
|
||||
tar xvzf 7.7.tar.gz && \
|
||||
cd postgresql-unit-7.7 && \
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
@@ -165,6 +161,107 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz &
|
||||
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "vector-pg-build"
|
||||
# compile pgvector extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgjwt-pg-build"
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hypopg-pg-build"
|
||||
# compile hypopg extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hypopg-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
# This layer is used to build `pgx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl libclang-dev cmake && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \
|
||||
/bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-jsonschema-pg-build"
|
||||
# Compile "pg_jsonschema" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-jsonschema-pg-build
|
||||
|
||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \
|
||||
cd pg_jsonschema && \
|
||||
cargo pgx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-graphql-pg-build"
|
||||
# Compile "pg_graphql" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-graphql-pg-build
|
||||
|
||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \
|
||||
cd pg_graphql && \
|
||||
cargo pgx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -178,6 +275,11 @@ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -228,7 +330,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
@@ -238,6 +342,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libicu67, locales for collations (including ICU)
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
locales \
|
||||
@@ -249,6 +354,8 @@ RUN apt update && \
|
||||
libproj19 \
|
||||
libprotobuf-c1 \
|
||||
libsfcgal1 \
|
||||
libxml2 \
|
||||
libxslt1.1 \
|
||||
gdb && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
32
Dockerfile.vm-compute-node
Normal file
32
Dockerfile.vm-compute-node
Normal file
@@ -0,0 +1,32 @@
|
||||
# Note: this file *mostly* just builds on Dockerfile.compute-node
|
||||
|
||||
ARG SRC_IMAGE
|
||||
ARG VM_INFORMANT_VERSION=v0.1.6
|
||||
|
||||
# Pull VM informant and set up inittab
|
||||
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
|
||||
|
||||
RUN set -e \
|
||||
&& rm -f /etc/inittab \
|
||||
&& touch /etc/inittab
|
||||
|
||||
ADD vm-cgconfig.conf /etc/cgconfig.conf
|
||||
RUN set -e \
|
||||
&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
|
||||
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab
|
||||
|
||||
# Combine, starting from non-VM compute node image.
|
||||
FROM $SRC_IMAGE as base
|
||||
|
||||
# Temporarily set user back to root so we can run apt update and adduser
|
||||
USER root
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
cgroup-tools
|
||||
RUN adduser vm-informant --disabled-password --no-create-home
|
||||
USER postgres
|
||||
|
||||
COPY --from=informant /etc/inittab /etc/inittab
|
||||
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
|
||||
|
||||
ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
|
||||
12
Makefile
12
Makefile
@@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-%
|
||||
|
||||
.PHONY: neon-pg-ext-clean-%
|
||||
neon-pg-ext-clean-%:
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
|
||||
15
README.md
15
README.md
@@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||
protobuf-devel
|
||||
```
|
||||
* On Arch based systems, these packages are needed:
|
||||
```bash
|
||||
pacman -S base-devel readline zlib libseccomp openssl clang \
|
||||
postgresql-libs cmake postgresql protobuf
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
@@ -83,9 +88,10 @@ cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`nproc`"
|
||||
# use "BUILD_TYPE=release make -j`nproc` -s"
|
||||
# Remove -s for the verbose build log
|
||||
|
||||
make -j`nproc`
|
||||
make -j`nproc` -s
|
||||
```
|
||||
|
||||
#### Building on OSX
|
||||
@@ -99,9 +105,10 @@ cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
|
||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s"
|
||||
# Remove -s for the verbose build log
|
||||
|
||||
make -j`sysctl -n hw.logicalcpu`
|
||||
make -j`sysctl -n hw.logicalcpu` -s
|
||||
```
|
||||
|
||||
#### Dependency installation notes
|
||||
|
||||
@@ -44,7 +44,6 @@ use tracing::{error, info};
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::informant::spawn_vm_informant_if_present;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
@@ -141,8 +140,6 @@ fn main() -> Result<()> {
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
// Also spawn the thread responsible for handling the VM informant -- if it's present
|
||||
let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
|
||||
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
@@ -10,8 +11,6 @@ use serde_json;
|
||||
use tracing::{error, info};
|
||||
use tracing_utils::http::OtelName;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
||||
//
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
use std::path::Path;
|
||||
use std::process;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
|
||||
const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
|
||||
|
||||
/// Launch a thread to start the VM informant if it's present (and restart, on failure)
|
||||
pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
|
||||
let exists = Path::new(VM_INFORMANT_PATH)
|
||||
.try_exists()
|
||||
.context("could not check if path exists")?;
|
||||
|
||||
if !exists {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(
|
||||
thread::Builder::new()
|
||||
.name("run-vm-informant".into())
|
||||
.spawn(move || run_informant())?,
|
||||
))
|
||||
}
|
||||
|
||||
fn run_informant() -> ! {
|
||||
let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
|
||||
|
||||
info!("starting VM informant");
|
||||
|
||||
loop {
|
||||
let mut cmd = process::Command::new(VM_INFORMANT_PATH);
|
||||
// Block on subprocess:
|
||||
let result = cmd.status();
|
||||
|
||||
match result {
|
||||
Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
|
||||
Ok(status) if !status.success() => {
|
||||
warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
|
||||
}
|
||||
Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
|
||||
}
|
||||
|
||||
// Wait before retrying
|
||||
thread::sleep(restart_wait);
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,6 @@ pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod informant;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -15,6 +15,7 @@ postgres.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -419,6 +419,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.get("eviction_policy")
|
||||
.map(|x| serde_json::from_str(x))
|
||||
.transpose()
|
||||
.context("Failed to parse 'eviction_policy' json")?,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
@@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898'
|
||||
checkpoint_distance = '268435456' # in bytes
|
||||
checkpoint_timeout = '10m'
|
||||
|
||||
gc_period = '100 s'
|
||||
gc_period = '1 hour'
|
||||
gc_horizon = '67108864'
|
||||
|
||||
max_file_descriptors = '100'
|
||||
@@ -101,7 +101,7 @@ away.
|
||||
|
||||
#### gc_period
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
Interval at which garbage collection is triggered. Default is 1 hour.
|
||||
|
||||
#### image_creation_threshold
|
||||
|
||||
@@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3.
|
||||
|
||||
#### pitr_interval
|
||||
|
||||
WAL retention duration for PITR branching. Default is 30 days.
|
||||
WAL retention duration for PITR branching. Default is 7 days.
|
||||
|
||||
#### walreceiver_connect_timeout
|
||||
|
||||
|
||||
335
docs/synthetic-size.md
Normal file
335
docs/synthetic-size.md
Normal file
@@ -0,0 +1,335 @@
|
||||
# Synthetic size
|
||||
|
||||
Neon storage has copy-on-write branching, which makes it difficult to
|
||||
answer the question "how large is my database"? To give one reasonable
|
||||
answer, we calculate _synthetic size_ for a project.
|
||||
|
||||
The calculation is called "synthetic", because it is based purely on
|
||||
the user-visible logical size, which is the size that you would see on
|
||||
a standalone PostgreSQL installation, and the amount of WAL, which is
|
||||
also the same as what you'd see on a standalone PostgreSQL, for the
|
||||
same set of updates.
|
||||
|
||||
The synthetic size does *not* depend on the actual physical size
|
||||
consumed in the storage, or implementation details of the Neon storage
|
||||
like garbage collection, compaction and compression. There is a
|
||||
strong *correlation* between the physical size and the synthetic size,
|
||||
but the synthetic size is designed to be independent of the
|
||||
implementation details, so that any improvements we make in the
|
||||
storage system simply reduce our COGS. And vice versa: any bugs or bad
|
||||
implementation where we keep more data than we would need to, do not
|
||||
change the synthetic size or incur any costs to the user.
|
||||
|
||||
The synthetic size is calculated for the whole project. It is not
|
||||
straighforward to attribute size to individual branches. See "What is
|
||||
the size of an individual branch?" for discussion on those
|
||||
difficulties.
|
||||
|
||||
The synthetic size is designed to:
|
||||
|
||||
- Take into account the copy-on-write nature of the storage. For
|
||||
example, if you create a branch, it doesn't immediately add anything
|
||||
to the synthetic size. It starts to affect the synthetic size only
|
||||
as it diverges from the parent branch.
|
||||
|
||||
- Be independent of any implementation details of the storage, like
|
||||
garbage collection, remote storage, or compression.
|
||||
|
||||
## Terms & assumptions
|
||||
|
||||
- logical size is the size of a branch *at a given point in
|
||||
time*. It's the total size of all tables in all databases, as you
|
||||
see with "\l+" in psql for example, plus the Postgres SLRUs and some
|
||||
small amount of metadata. NOTE that currently, Neon does not include
|
||||
the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`.
|
||||
|
||||
- a "point in time" is defined as an LSN value. You can convert a
|
||||
timestamp to an LSN, but the storage internally works with LSNs.
|
||||
|
||||
- PITR horizon can be set per-branch.
|
||||
|
||||
- PITR horizon can be set as a time interval, e.g. 5 days or hours, or
|
||||
as amount of WAL, in bytes. If it's given as a time interval, it's
|
||||
converted to an LSN for the calculation.
|
||||
|
||||
- PITR horizon can be set to 0, if you don't want to retain any history.
|
||||
|
||||
## Calculation
|
||||
|
||||
Inputs to the calculation are:
|
||||
- logical size of the database at different points in time,
|
||||
- amount of WAL generated, and
|
||||
- the PITR horizon settings
|
||||
|
||||
The synthetic size is based on an idealistic model of the storage
|
||||
system, where we pretend that the storage consists of two things:
|
||||
- snapshots, containing a full snapshot of the database, at a given
|
||||
point in time, and
|
||||
- WAL.
|
||||
|
||||
In the simple case that the project contains just one branch (main),
|
||||
and a fixed PITR horizon, the synthetic size is the sum of:
|
||||
|
||||
- the logical size of the branch *at the beginning of the PITR
|
||||
horizon*, i.e. at the oldest point that you can still recover to, and
|
||||
- the size of the WAL covering the PITR horizon.
|
||||
|
||||
The snapshot allows you to recover to the beginning of the PITR
|
||||
horizon, and the WAL allows you to recover from that point to any
|
||||
point within the horizon.
|
||||
|
||||
```
|
||||
WAL
|
||||
-----------------------#########>
|
||||
^
|
||||
snapshot
|
||||
|
||||
Legend:
|
||||
##### PITR horizon. This is the region that you can still access
|
||||
with Point-in-time query and you can still create branches
|
||||
from.
|
||||
----- history that has fallen out of the PITR horizon, and can no
|
||||
longer be accessed
|
||||
```
|
||||
|
||||
NOTE: This is not how the storage system actually works! The actual
|
||||
implementation is also based on snapshots and WAL, but the snapshots
|
||||
are taken for individual database pages and ranges of pages rather
|
||||
than the whole database, and it is much more complicated. This model
|
||||
is a reasonable approximation, however, to make the synthetic size a
|
||||
useful proxy for the actual storage consumption.
|
||||
|
||||
|
||||
## Example: Data is INSERTed
|
||||
|
||||
For example, let's assume that your database contained 10 GB of data
|
||||
at the beginning of the PITR horizon, and you have since then inserted
|
||||
5 GB of additional data into it. The additional insertions of 5 GB of
|
||||
data consume roughly 5 GB of WAL. In that case, the synthetic size is:
|
||||
|
||||
> 10 GB (snapshot) + 5 GB (WAL) = 15 GB
|
||||
|
||||
If you now set the PITR horizon on the project to 0, so that no
|
||||
historical data is retained, then the beginning PITR horizon would be
|
||||
at the end of the branch, so the size of the snapshot would be
|
||||
calculated at the end of the branch, after the insertions. Then the
|
||||
synthetic size is:
|
||||
|
||||
> 15 GB (snapshot) + 0 GB (WAL) = 15 GB.
|
||||
|
||||
In this case, the synthetic size is the same, regardless of the PITR horizon,
|
||||
because all the history consists of inserts. The newly inserted data takes
|
||||
up the same amount of space, whether it's stored as part of the logical
|
||||
snapshot, or as WAL. (*)
|
||||
|
||||
(*) This is a rough approximation. In reality, the WAL contains
|
||||
headers and other overhead, and on the other hand, the logical
|
||||
snapshot includes empty space on pages, so the size of insertions in
|
||||
WAL can be smaller or greater than the size of the final table after
|
||||
the insertions. But in most cases, it's in the same ballpark.
|
||||
|
||||
## Example: Data is DELETEd
|
||||
|
||||
Let's look at another example:
|
||||
|
||||
Let's start again with a database that contains 10 GB of data. Then,
|
||||
you DELETE 5 GB of the data, and run VACUUM to free up the space, so
|
||||
that the logical size of the database is now only 5 GB.
|
||||
|
||||
Let's assume that the WAL for the deletions and the vacuum take up
|
||||
100 MB of space. In that case, the synthetic size of the project is:
|
||||
|
||||
> 10 GB (snapshot) + 100 MB (WAL) = 10.1 GB
|
||||
|
||||
This is much larger than the logical size of the database after the
|
||||
deletions (5 GB). That's because the system still needs to retain the
|
||||
deleted data, because it's still accessible to queries and branching
|
||||
in the PITR window.
|
||||
|
||||
If you now set the PITR horizon to 0 or just wait for time to pass so
|
||||
that the data falls out of the PITR horizon, making the deleted data
|
||||
inaccessible, the synthetic size shrinks:
|
||||
|
||||
> 5 GB (snapshot) + 0 GB (WAL) = 5 GB
|
||||
|
||||
|
||||
# Branching
|
||||
|
||||
Things get more complicated with branching. Branches in Neon are
|
||||
copy-on-write, which is also reflected in the synthetic size.
|
||||
|
||||
When you create a branch, it doesn't immediately change the synthetic
|
||||
size at all. The branch point is within the PITR horizon, and all the
|
||||
data needed to recover to that point in time needs to be retained
|
||||
anyway.
|
||||
|
||||
However, if you make modifications on the branch, the system needs to
|
||||
keep the WAL of those modifications. The WAL is included in the
|
||||
synthetic size.
|
||||
|
||||
## Example: branch and INSERT
|
||||
|
||||
Let's assume that you again start with a 10 GB database.
|
||||
On the main branch, you insert 2 GB of data. Then you create
|
||||
a branch at that point, and insert another 3 GB of data on the
|
||||
main branch, and 1 GB of data on the child branch
|
||||
|
||||
```
|
||||
child +#####>
|
||||
|
|
||||
| WAL
|
||||
main ---------###############>
|
||||
^
|
||||
snapshot
|
||||
```
|
||||
|
||||
In this case, the synthetic size consists of:
|
||||
- the snapshot at the beginning of the PITR horizon (10 GB)
|
||||
- the WAL on the main branch (2 GB + 3 GB = 5 GB)
|
||||
- the WAL on the child branch (1 GB)
|
||||
|
||||
Total: 16 GB
|
||||
|
||||
# Diverging branches
|
||||
|
||||
If there is only a small amount of changes in the database on the
|
||||
different branches, as in the previous example, the synthetic size
|
||||
consists of a snapshot before the branch point, containing all the
|
||||
shared data, and the WAL on both branches. However, if the branches
|
||||
diverge a lot, it is more efficient to store a separate snapshot of
|
||||
branches.
|
||||
|
||||
## Example: diverging branches
|
||||
|
||||
You start with a 10 GB database. You insert 5 GB of data on the main
|
||||
branch. Then you create a branch, and immediately delete all the data
|
||||
on the child branch and insert 5 GB of new data to it. Then you do the
|
||||
same on the main branch. Let's assume
|
||||
that the PITR horizon requires keeping the last 1 GB of WAL on the
|
||||
both branches.
|
||||
|
||||
```
|
||||
snapshot
|
||||
v WAL
|
||||
child +---------##############>
|
||||
|
|
||||
|
|
||||
main -------------+---------##############>
|
||||
^ WAL
|
||||
snapshot
|
||||
```
|
||||
|
||||
In this case, the synthetic size consists of:
|
||||
- snapshot at the beginning of the PITR horizon on the main branch (4 GB)
|
||||
- WAL on the main branch (1 GB)
|
||||
- snapshot at the beginning of the PITR horizon on the child branch (4 GB)
|
||||
- last 1 GB of WAL on the child branch (1 GB)
|
||||
|
||||
Total: 10 GB
|
||||
|
||||
The alternative way to store this would be to take only one snapshot
|
||||
at the beginning of branch point, and keep all the WAL on both
|
||||
branches. However, the size with that method would be larger, as it
|
||||
would require one 10 GB snapshot, and 5 GB + 5 GB of WAL. It depends
|
||||
on the amount of changes (WAL) on both branches, and the logical size
|
||||
at the branch point, which method would result in a smaller synthetic
|
||||
size. On each branch point, the system performs the calculation with
|
||||
both methods, and uses the method that is cheaper, i.e. the one that
|
||||
results in a smaller synthetic size.
|
||||
|
||||
One way to think about this is that when you create a branch, it
|
||||
starts out as a thin branch that only stores the WAL since the branch
|
||||
point. As you modify it, and the amount of WAL grows, at some point
|
||||
it becomes cheaper to store a completely new snapshot of the branch
|
||||
and truncate the WAL.
|
||||
|
||||
|
||||
# What is the size of an individual branch?
|
||||
|
||||
Synthetic size is calculated for the whole project, and includes all
|
||||
branches. There is no such thing as the size of a branch, because it
|
||||
is not straighforward to attribute the parts of size to individual
|
||||
branches.
|
||||
|
||||
## Example: attributing size to branches
|
||||
|
||||
(copied from https://github.com/neondatabase/neon/pull/2884#discussion_r1029365278)
|
||||
|
||||
Imagine that you create two branches, A and B, at the same point from
|
||||
main branch, and do a couple of small updates on both branches. Then
|
||||
six months pass, and during those six months the data on the main
|
||||
branch churns over completely multiple times. The retention period is,
|
||||
say 1 month.
|
||||
|
||||
```
|
||||
+------> A
|
||||
/
|
||||
--------------------*-------------------------------> main
|
||||
\
|
||||
+--------> B
|
||||
```
|
||||
|
||||
In that situation, the synthetic tenant size would be calculated based
|
||||
on a "logical snapshot" at the branch point, that is, the logical size
|
||||
of the database at that point. Plus the WAL on branches A and B. Let's
|
||||
say that the snapshot size is 10 GB, and the WAL is 1 MB on both
|
||||
branches A and B. So the total synthetic storage size is 10002
|
||||
MB. (Let's ignore the main branch for now, that would be just added to
|
||||
the sum)
|
||||
|
||||
How would you break that down per branch? I can think of three
|
||||
different ways to do it, and all of them have their own problems:
|
||||
|
||||
### Subtraction method
|
||||
|
||||
For each branch, calculate how much smaller the total synthetic size
|
||||
would be, if that branch didn't exist. In other words, how much would
|
||||
you save if you dropped the branch. With this method, the size of
|
||||
branches A and B is 1 MB.
|
||||
|
||||
With this method, the 10 GB shared logical snapshot is not included
|
||||
for A nor B. So the size of all branches is not equal to the total
|
||||
synthetic size of the tenant. If you drop branch A, you save 1 MB as
|
||||
you'd expect, but also the size of B suddenly jumps from 1 MB to 10001
|
||||
MB, which might feel surprising.
|
||||
|
||||
### Division method
|
||||
|
||||
Divide the common parts evenly across all branches that need
|
||||
them. With this method, the size of branches A and B would be 5001 MB.
|
||||
|
||||
With this method, the sum of all branches adds up to the total
|
||||
synthetic size. But it's surprising in other ways: if you drop branch
|
||||
A, you might think that you save 5001 MB, but in reality you only save
|
||||
1 MB, and the size of branch B suddenly grows from 5001 to 10001 MB.
|
||||
|
||||
### Addition method
|
||||
|
||||
For each branch, include all the snapshots and WAL that it depends on,
|
||||
even if some of them are shared by other branches. With this method,
|
||||
the size of branches A and B would be 10001 MB.
|
||||
|
||||
The surprise with this method is that the sum of all the branches is
|
||||
larger than the total synthetic size. And if you drop branch A, the
|
||||
total synthetic size doesn't fall by 10001 MB as you might think.
|
||||
|
||||
# Alternatives
|
||||
|
||||
A sort of cop-out method would be to show the whole tree of branches
|
||||
graphically, and for each section of WAL or logical snapshot, display
|
||||
the size of that section. You can then see which branches depend on
|
||||
which sections, which sections are shared etc. That would be good to
|
||||
have in the UI anyway.
|
||||
|
||||
Or perhaps calculate per-branch numbers using the subtraction method,
|
||||
and in addition to that, one more number for "shared size" that
|
||||
includes all the data that is needed by more than one branch.
|
||||
|
||||
## Which is the right method?
|
||||
|
||||
The bottom line is that it's not straightforward to attribute the
|
||||
synthetic size to individual branches. There are things we can do, and
|
||||
all of those methods are pretty straightforward to implement, but they
|
||||
all have their own problems. What makes sense depends a lot on what
|
||||
you want to do with the number, what question you are trying to
|
||||
answer.
|
||||
@@ -14,5 +14,6 @@ byteorder.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
enum-map.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -155,6 +155,11 @@ pub struct TenantConfigRequest {
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
// We defer the parsing of the eviction_policy field to the request handler.
|
||||
// Otherwise we'd have to move the types for eviction policy into this package.
|
||||
// We might do that once the eviction feature has stabilizied.
|
||||
// For now, this field is not even documented in the openapi_spec.yml.
|
||||
pub eviction_policy: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
@@ -174,6 +179,7 @@ impl TenantConfigRequest {
|
||||
lagging_wal_timeout: None,
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
eviction_policy: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -263,11 +269,11 @@ pub struct LayerResidenceEvent {
|
||||
///
|
||||
#[serde(rename = "timestamp_millis_since_epoch")]
|
||||
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
|
||||
timestamp: SystemTime,
|
||||
pub timestamp: SystemTime,
|
||||
/// The new residence status of the layer.
|
||||
status: LayerResidenceStatus,
|
||||
pub status: LayerResidenceStatus,
|
||||
/// The reason why we had to record this event.
|
||||
reason: LayerResidenceEventReason,
|
||||
pub reason: LayerResidenceEventReason,
|
||||
}
|
||||
|
||||
/// The reason for recording a given [`ResidenceEvent`].
|
||||
|
||||
@@ -75,27 +75,36 @@ impl StartupMessageParams {
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let mut last_was_escape = false;
|
||||
let iter = self
|
||||
.get("options")?
|
||||
.split(move |c: char| {
|
||||
// We split by non-escaped whitespace symbols.
|
||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||
last_was_escape = c == '\\' && !last_was_escape;
|
||||
should_split
|
||||
})
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
Some(iter)
|
||||
self.get("options").map(Self::parse_options_raw)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
||||
self.get("options").map(Self::parse_options_escaped)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let iter = self.options_raw()?.map(|s| {
|
||||
let mut last_was_escape = false;
|
||||
input
|
||||
.split(move |c: char| {
|
||||
// We split by non-escaped whitespace symbols.
|
||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||
last_was_escape = c == '\\' && !last_was_escape;
|
||||
should_split
|
||||
})
|
||||
.filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
pub fn parse_options_escaped(input: &str) -> impl Iterator<Item = Cow<'_, str>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
Self::parse_options_raw(input).map(|s| {
|
||||
let mut preserve_next_escape = false;
|
||||
let escape = |c| {
|
||||
// We should remove '\\' unless it's preceded by '\\'.
|
||||
@@ -108,9 +117,12 @@ impl StartupMessageParams {
|
||||
true => Cow::Owned(s.replace(escape, "")),
|
||||
false => Cow::Borrowed(s),
|
||||
}
|
||||
});
|
||||
})
|
||||
}
|
||||
|
||||
Some(iter)
|
||||
/// Iterate through key-value pairs in an arbitrary order.
|
||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
|
||||
@@ -21,7 +21,7 @@ toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
pin-project-lite.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -20,7 +20,10 @@ use aws_sdk_s3::{
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::Body;
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio::{
|
||||
io::{self, AsyncRead},
|
||||
sync::Semaphore,
|
||||
};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
@@ -102,7 +105,7 @@ pub struct S3Bucket {
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
concurrency_limiter: Semaphore,
|
||||
concurrency_limiter: Arc<Semaphore>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -162,7 +165,7 @@ impl S3Bucket {
|
||||
client,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -194,9 +197,10 @@ impl S3Bucket {
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let _guard = self
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")
|
||||
.map_err(DownloadError::Other)?;
|
||||
@@ -217,9 +221,10 @@ impl S3Bucket {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(io::BufReader::new(
|
||||
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
|
||||
permit,
|
||||
object_output.body.into_async_read(),
|
||||
)),
|
||||
))),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError {
|
||||
@@ -240,6 +245,32 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
||||
struct RatelimitedAsyncRead<S> {
|
||||
permit: tokio::sync::OwnedSemaphorePermit,
|
||||
#[pin]
|
||||
inner: S,
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
|
||||
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
|
||||
RatelimitedAsyncRead { permit, inner }
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
let this = self.project();
|
||||
this.inner.poll_read(cx, buf)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
|
||||
@@ -7,5 +7,7 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
219
libs/tenant_size_model/src/calculation.rs
Normal file
219
libs/tenant_size_model/src/calculation.rs
Normal file
@@ -0,0 +1,219 @@
|
||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
#[derive(Clone, Debug)]
|
||||
struct SegmentSize {
|
||||
method: SegmentMethod,
|
||||
|
||||
// calculated size of this subtree, using this method
|
||||
accum_size: u64,
|
||||
|
||||
seg_id: usize,
|
||||
children: Vec<SegmentSize>,
|
||||
}
|
||||
|
||||
struct SizeAlternatives {
|
||||
// cheapest alternative if parent is available.
|
||||
incremental: SegmentSize,
|
||||
|
||||
// cheapest alternative if parent node is not available
|
||||
non_incremental: Option<SegmentSize>,
|
||||
}
|
||||
|
||||
impl StorageModel {
|
||||
pub fn calculate(&self) -> SizeResult {
|
||||
// Build adjacency list. 'child_list' is indexed by segment id. Each entry
|
||||
// contains a list of all child segments of the segment.
|
||||
let mut roots: Vec<usize> = Vec::new();
|
||||
let mut child_list: Vec<Vec<usize>> = Vec::new();
|
||||
child_list.resize(self.segments.len(), Vec::new());
|
||||
|
||||
for (seg_id, seg) in self.segments.iter().enumerate() {
|
||||
if let Some(parent_id) = seg.parent {
|
||||
child_list[parent_id].push(seg_id);
|
||||
} else {
|
||||
roots.push(seg_id);
|
||||
}
|
||||
}
|
||||
|
||||
let mut segment_results = Vec::new();
|
||||
segment_results.resize(
|
||||
self.segments.len(),
|
||||
SegmentSizeResult {
|
||||
method: SegmentMethod::Skipped,
|
||||
accum_size: 0,
|
||||
},
|
||||
);
|
||||
|
||||
let mut total_size = 0;
|
||||
for root in roots {
|
||||
if let Some(selected) = self.size_here(root, &child_list).non_incremental {
|
||||
StorageModel::fill_selected_sizes(&selected, &mut segment_results);
|
||||
total_size += selected.accum_size;
|
||||
} else {
|
||||
// Couldn't find any way to get this root. Error?
|
||||
}
|
||||
}
|
||||
|
||||
SizeResult {
|
||||
total_size,
|
||||
segments: segment_results,
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec<SegmentSizeResult>) {
|
||||
result[selected.seg_id] = SegmentSizeResult {
|
||||
method: selected.method,
|
||||
accum_size: selected.accum_size,
|
||||
};
|
||||
// recurse to children
|
||||
for child in selected.children.iter() {
|
||||
StorageModel::fill_selected_sizes(child, result);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// This is the core of the sizing calculation.
|
||||
//
|
||||
// This is a recursive function, that for each Segment calculates the best way
|
||||
// to reach all the Segments that are marked as needed in this subtree, under two
|
||||
// different conditions:
|
||||
// a) when the parent of this segment is available (as a snaphot or through WAL), and
|
||||
// b) when the parent of this segment is not available.
|
||||
//
|
||||
fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives {
|
||||
let seg = &self.segments[seg_id];
|
||||
// First figure out the best way to get each child
|
||||
let mut children = Vec::new();
|
||||
for child_id in &child_list[seg_id] {
|
||||
children.push(self.size_here(*child_id, child_list))
|
||||
}
|
||||
|
||||
// Method 1. If this node is not needed, we can skip it as long as we
|
||||
// take snapshots later in each sub-tree
|
||||
let snapshot_later = if !seg.needed {
|
||||
let mut snapshot_later = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::Skipped,
|
||||
accum_size: 0,
|
||||
children: Vec::new(),
|
||||
};
|
||||
|
||||
let mut possible = true;
|
||||
for child in children.iter() {
|
||||
if let Some(non_incremental) = &child.non_incremental {
|
||||
snapshot_later.accum_size += non_incremental.accum_size;
|
||||
snapshot_later.children.push(non_incremental.clone())
|
||||
} else {
|
||||
possible = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if possible {
|
||||
Some(snapshot_later)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of
|
||||
// this Segment was given.
|
||||
let snapshot_here = if !seg.needed || seg.parent.is_none() {
|
||||
if let Some(snapshot_size) = seg.size {
|
||||
let mut snapshot_here = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::SnapshotHere,
|
||||
accum_size: snapshot_size,
|
||||
children: Vec::new(),
|
||||
};
|
||||
for child in children.iter() {
|
||||
snapshot_here.accum_size += child.incremental.accum_size;
|
||||
snapshot_here.children.push(child.incremental.clone())
|
||||
}
|
||||
Some(snapshot_here)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Method 3. Use WAL to get here from parent
|
||||
let wal_here = {
|
||||
let mut wal_here = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::Wal,
|
||||
accum_size: if let Some(parent_id) = seg.parent {
|
||||
seg.lsn - self.segments[parent_id].lsn
|
||||
} else {
|
||||
0
|
||||
},
|
||||
children: Vec::new(),
|
||||
};
|
||||
for child in children {
|
||||
wal_here.accum_size += child.incremental.accum_size;
|
||||
wal_here.children.push(child.incremental)
|
||||
}
|
||||
wal_here
|
||||
};
|
||||
|
||||
// If the parent is not available, what's the cheapest method involving
|
||||
// a snapshot here or later?
|
||||
let mut cheapest_non_incremental: Option<SegmentSize> = None;
|
||||
if let Some(snapshot_here) = snapshot_here {
|
||||
cheapest_non_incremental = Some(snapshot_here);
|
||||
}
|
||||
if let Some(snapshot_later) = snapshot_later {
|
||||
// Use <=, to prefer skipping if the size is equal
|
||||
if let Some(parent) = &cheapest_non_incremental {
|
||||
if snapshot_later.accum_size <= parent.accum_size {
|
||||
cheapest_non_incremental = Some(snapshot_later);
|
||||
}
|
||||
} else {
|
||||
cheapest_non_incremental = Some(snapshot_later);
|
||||
}
|
||||
}
|
||||
|
||||
// And what's the cheapest method, if the parent is available?
|
||||
let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental
|
||||
{
|
||||
// Is it cheaper to use a snapshot here or later, anyway?
|
||||
// Use <, to prefer Wal over snapshot if the cost is the same
|
||||
if wal_here.accum_size < cheapest_non_incremental.accum_size {
|
||||
wal_here
|
||||
} else {
|
||||
cheapest_non_incremental.clone()
|
||||
}
|
||||
} else {
|
||||
wal_here
|
||||
};
|
||||
|
||||
SizeAlternatives {
|
||||
incremental: cheapest_incremental,
|
||||
non_incremental: cheapest_non_incremental,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,401 +1,70 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
//! Synthetic size calculation
|
||||
|
||||
use anyhow::Context;
|
||||
mod calculation;
|
||||
pub mod svg;
|
||||
|
||||
/// Pricing model or history size builder.
|
||||
/// StorageModel is the input to the synthetic size calculation. It represents
|
||||
/// a tree of timelines, with just the information that's needed for the
|
||||
/// calculation. This doesn't track timeline names or where each timeline
|
||||
/// begins and ends, for example. Instead, it consists of "points of interest"
|
||||
/// on the timelines. A point of interest could be the timeline start or end point,
|
||||
/// the oldest point on a timeline that needs to be retained because of PITR
|
||||
/// cutoff, or snapshot points named by the user. For each such point, and the
|
||||
/// edge connecting the points (implicit in Segment), we store information about
|
||||
/// whether we need to be able to recover to the point, and if known, the logical
|
||||
/// size at the point.
|
||||
///
|
||||
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||
/// type.
|
||||
pub struct Storage<K: 'static> {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of a segment describing it's latest state.
|
||||
branches: HashMap<K, usize>,
|
||||
/// The segments must form a well-formed tree, with no loops.
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct StorageModel {
|
||||
pub segments: Vec<Segment>,
|
||||
}
|
||||
|
||||
/// Snapshot of a branch.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
/// Segment represents one point in the tree of branches, *and* the edge that leads
|
||||
/// to it (if any). We don't need separate structs for points and edges, because each
|
||||
/// point can have only one parent.
|
||||
///
|
||||
/// When 'needed' is true, it means that we need to be able to reconstruct
|
||||
/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only
|
||||
/// a single point is needed, create two Segments with the same lsn, and mark only
|
||||
/// the child as needed.
|
||||
///
|
||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct Segment {
|
||||
/// Previous segment index into ['Storage::segments`], if any.
|
||||
parent: Option<usize>,
|
||||
pub parent: Option<usize>,
|
||||
|
||||
/// Description of how did we get to this state.
|
||||
///
|
||||
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
|
||||
/// modifying a branch directly.
|
||||
pub op: Cow<'static, str>,
|
||||
/// LSN at this point
|
||||
pub lsn: u64,
|
||||
|
||||
/// LSN before this state
|
||||
start_lsn: u64,
|
||||
/// Logical size at this node, if known.
|
||||
pub size: Option<u64>,
|
||||
|
||||
/// LSN at this state
|
||||
pub end_lsn: u64,
|
||||
|
||||
/// Logical size before this state
|
||||
start_size: u64,
|
||||
|
||||
/// Logical size at this state. Can be None in the last Segment of a branch.
|
||||
pub end_size: Option<u64>,
|
||||
|
||||
/// Indices to [`Storage::segments`]
|
||||
///
|
||||
/// FIXME: this could be an Option<usize>
|
||||
children_after: Vec<usize>,
|
||||
|
||||
/// Determined by `retention_period` given to [`Storage::calculate`]
|
||||
/// If true, the segment from parent to this node is needed by `retention_period`
|
||||
pub needed: bool,
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
/// Result of synthetic size calculation. Returned by StorageModel::calculate()
|
||||
pub struct SizeResult {
|
||||
pub total_size: u64,
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
pub struct SegmentSize {
|
||||
pub seg_id: usize,
|
||||
|
||||
pub method: SegmentMethod,
|
||||
|
||||
this_size: u64,
|
||||
|
||||
pub children: Vec<SegmentSize>,
|
||||
// This has same length as the StorageModel::segments vector in the input.
|
||||
// Each entry in this array corresponds to the entry with same index in
|
||||
// StorageModel::segments.
|
||||
pub segments: Vec<SegmentSizeResult>,
|
||||
}
|
||||
|
||||
impl SegmentSize {
|
||||
fn total(&self) -> u64 {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
|
||||
pub fn total_children(&self) -> u64 {
|
||||
if self.method == SnapshotAfter {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
} else {
|
||||
self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct SegmentSizeResult {
|
||||
pub method: SegmentMethod,
|
||||
// calculated size of this subtree, using this method
|
||||
pub accum_size: u64,
|
||||
}
|
||||
|
||||
/// Different methods to retain history from a particular state
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum SegmentMethod {
|
||||
SnapshotAfter,
|
||||
Wal,
|
||||
WalNeeded,
|
||||
SnapshotHere, // A logical snapshot is needed after this segment
|
||||
Wal, // Keep WAL leading up to this node
|
||||
Skipped,
|
||||
}
|
||||
|
||||
use SegmentMethod::*;
|
||||
|
||||
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: K) -> Storage<K> {
|
||||
let init_segment = Segment {
|
||||
op: "".into(),
|
||||
needed: false,
|
||||
parent: None,
|
||||
start_lsn: 0,
|
||||
end_lsn: 0,
|
||||
start_size: 0,
|
||||
end_size: Some(0),
|
||||
children_after: Vec::new(),
|
||||
};
|
||||
|
||||
Storage {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch, 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with a new point, at given LSN.
|
||||
pub fn insert_point<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn: u64,
|
||||
size: Option<u64>,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
assert!(lsn > lastseg.end_lsn);
|
||||
|
||||
let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lsn,
|
||||
start_size,
|
||||
end_size: size,
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn_bytes: u64,
|
||||
size_bytes: i64,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||
start_size: last_end_size,
|
||||
end_size: Some((last_end_size as i64 + size_bytes) as u64),
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
|
||||
}
|
||||
|
||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "update".into(), bytes, 0i64)
|
||||
}
|
||||
|
||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
|
||||
}
|
||||
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q> + std::fmt::Debug,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
// Find the right segment
|
||||
let branchseg_id = *self.branches.get(parent).with_context(|| {
|
||||
format!(
|
||||
"should had found the parent {:?} by key. in branches {:?}",
|
||||
parent, self.branches
|
||||
)
|
||||
})?;
|
||||
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name, branchseg_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.end_lsn < cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: For each oldest segment in a chain that needs to be retained,
|
||||
// calculate if we should store snapshot or WAL
|
||||
self.size_from_snapshot_later(0)
|
||||
}
|
||||
|
||||
fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
let seg = &self.segments[seg_id];
|
||||
|
||||
let this_size = seg.end_lsn - seg.start_lsn;
|
||||
|
||||
let mut children = Vec::new();
|
||||
|
||||
// try both ways
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: if seg.needed { WalNeeded } else { Wal },
|
||||
this_size,
|
||||
children,
|
||||
})
|
||||
}
|
||||
|
||||
fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
// If this is needed, then it's time to do the snapshot and continue
|
||||
// with wal method.
|
||||
let seg = &self.segments[seg_id];
|
||||
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
|
||||
if seg.needed {
|
||||
let mut children = Vec::new();
|
||||
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: WalNeeded,
|
||||
this_size: seg.start_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
// If any of the direct children are "needed", need to be able to reconstruct here
|
||||
let mut children_needed = false;
|
||||
for &child in seg.children_after.iter() {
|
||||
let seg = &self.segments[child];
|
||||
if seg.needed {
|
||||
children_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let method1 = if !children_needed {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_snapshot_later(*child)?);
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: Skipped,
|
||||
this_size: 0,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// If this a junction, consider snapshotting here
|
||||
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_wal(*child)?);
|
||||
}
|
||||
let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: SnapshotAfter,
|
||||
this_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(match (method1, method2) {
|
||||
(None, None) => anyhow::bail!(
|
||||
"neither method was applicable: children_after={}, children_needed={}",
|
||||
seg.children_after.len(),
|
||||
children_needed
|
||||
),
|
||||
(Some(method), None) => method,
|
||||
(None, Some(method)) => method,
|
||||
(Some(method1), Some(method2)) => {
|
||||
if method1.total() < method2.total() {
|
||||
method1
|
||||
} else {
|
||||
method2
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_segments(self) -> Vec<Segment> {
|
||||
self.segments
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,269 +0,0 @@
|
||||
//! Tenant size model testing ground.
|
||||
//!
|
||||
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
|
||||
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
|
||||
//! into pngs.
|
||||
|
||||
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
storage.update("main", 1_000)?;
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
let mut storage = Storage::new("a");
|
||||
storage.insert("a", 5000)?;
|
||||
storage.branch("a", "b")?;
|
||||
storage.update("b", 4000)?;
|
||||
storage.update("a", 2000)?;
|
||||
storage.branch("a", "c")?;
|
||||
storage.insert("c", 4000)?;
|
||||
storage.insert("a", 2000)?;
|
||||
|
||||
let size = storage.calculate(5000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
use std::borrow::Cow;
|
||||
|
||||
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||
|
||||
let branches = [
|
||||
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
|
||||
Some(0xb1719e044db05401a05a2ed588a3ad3f),
|
||||
Some(0xb68d6691c895ad0a70809470020929ef),
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut storage = Storage::new(None);
|
||||
|
||||
storage.branch(&None, branches[0])?; // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
|
||||
storage.branch(&branches[0], branches[1])?; // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
|
||||
storage.branch(&branches[0], branches[2])?; // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
|
||||
|
||||
let size = storage.calculate(100_000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
|
||||
let scenario = if args.len() < 2 { "1" } else { &args[1] };
|
||||
|
||||
let (segments, size) = match scenario {
|
||||
"1" => scenario_1(),
|
||||
"2" => scenario_2(),
|
||||
"3" => scenario_3(),
|
||||
"4" => scenario_4(),
|
||||
"5" => scenario_5(),
|
||||
"6" => scenario_6(),
|
||||
other => {
|
||||
eprintln!("invalid scenario {}", other);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
.unwrap();
|
||||
|
||||
graphviz_tree(&segments, &size);
|
||||
}
|
||||
|
||||
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
|
||||
use tenant_size_model::SegmentMethod::*;
|
||||
|
||||
let seg_id = node.seg_id;
|
||||
let seg = segments.get(seg_id).unwrap();
|
||||
let lsn = seg.end_lsn;
|
||||
let size = seg.end_size.unwrap_or(0);
|
||||
let method = node.method;
|
||||
|
||||
println!(" {{");
|
||||
println!(" node [width=0.1 height=0.1 shape=oval]");
|
||||
|
||||
let tenant_size = node.total_children();
|
||||
|
||||
let penwidth = if seg.needed { 6 } else { 3 };
|
||||
let x = match method {
|
||||
SnapshotAfter =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
|
||||
Wal =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
WalNeeded =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
Skipped =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
|
||||
};
|
||||
|
||||
println!(" \"seg{seg_id}\" [{x}]");
|
||||
println!(" }}");
|
||||
|
||||
// Recurse. Much of the data is actually on the edge
|
||||
for child in node.children.iter() {
|
||||
let child_id = child.seg_id;
|
||||
graphviz_recurse(segments, child);
|
||||
|
||||
let edge_color = match child.method {
|
||||
SnapshotAfter => "gray",
|
||||
Wal => "black",
|
||||
WalNeeded => "black",
|
||||
Skipped => "gray",
|
||||
};
|
||||
|
||||
println!(" {{");
|
||||
println!(" edge [] ");
|
||||
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
|
||||
print!("color={edge_color}");
|
||||
if child.method == WalNeeded {
|
||||
print!(" penwidth=6");
|
||||
}
|
||||
if child.method == Wal {
|
||||
print!(" penwidth=3");
|
||||
}
|
||||
|
||||
let next = segments.get(child_id).unwrap();
|
||||
|
||||
if next.op.is_empty() {
|
||||
print!(
|
||||
" label=\"{} / {}\"",
|
||||
next.end_lsn - seg.end_lsn,
|
||||
(next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
|
||||
);
|
||||
} else {
|
||||
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
|
||||
}
|
||||
println!("]");
|
||||
println!(" }}");
|
||||
}
|
||||
}
|
||||
|
||||
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
||||
println!("digraph G {{");
|
||||
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
|
||||
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" graph [center=1 rankdir=LR]");
|
||||
println!(" edge [dir=none]");
|
||||
|
||||
graphviz_recurse(segments, tree);
|
||||
|
||||
println!("}}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenarios_return_same_size() {
|
||||
type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
|
||||
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||
(line!(), scenario_1, 8000),
|
||||
(line!(), scenario_2, 9000),
|
||||
(line!(), scenario_3, 13000),
|
||||
(line!(), scenario_4, 16000),
|
||||
(line!(), scenario_5, 17000),
|
||||
(line!(), scenario_6, 333_792_000),
|
||||
];
|
||||
|
||||
for (line, scenario, expected) in truths {
|
||||
let (_, size) = scenario().unwrap();
|
||||
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||
}
|
||||
}
|
||||
193
libs/tenant_size_model/src/svg.rs
Normal file
193
libs/tenant_size_model/src/svg.rs
Normal file
@@ -0,0 +1,193 @@
|
||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
use std::fmt::Write;
|
||||
|
||||
const SVG_WIDTH: f32 = 500.0;
|
||||
|
||||
struct SvgDraw<'a> {
|
||||
storage: &'a StorageModel,
|
||||
branches: &'a [String],
|
||||
seg_to_branch: &'a [usize],
|
||||
sizes: &'a [SegmentSizeResult],
|
||||
|
||||
// layout
|
||||
xscale: f32,
|
||||
min_lsn: u64,
|
||||
seg_coordinates: Vec<(f32, f32)>,
|
||||
}
|
||||
|
||||
fn draw_legend(result: &mut String) -> anyhow::Result<()> {
|
||||
writeln!(
|
||||
result,
|
||||
"<circle cx=\"10\" cy=\"10\" r=\"5\" stroke=\"red\"/>"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"15\">logical snapshot</text>")?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"30\" x2=\"15\" y2=\"30\" stroke-width=\"6\" stroke=\"black\" />"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<text x=\"20\" y=\"35\">WAL within retention period</text>"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"50\" x2=\"15\" y2=\"50\" stroke-width=\"3\" stroke=\"black\" />"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<text x=\"20\" y=\"55\">WAL retained to avoid copy</text>"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn draw_svg(
|
||||
storage: &StorageModel,
|
||||
branches: &[String],
|
||||
seg_to_branch: &[usize],
|
||||
sizes: &SizeResult,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut draw = SvgDraw {
|
||||
storage,
|
||||
branches,
|
||||
seg_to_branch,
|
||||
sizes: &sizes.segments,
|
||||
|
||||
xscale: 0.0,
|
||||
min_lsn: 0,
|
||||
seg_coordinates: Vec::new(),
|
||||
};
|
||||
|
||||
let mut result = String::new();
|
||||
|
||||
writeln!(result, "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" height=\"300\" width=\"500\">")?;
|
||||
|
||||
draw.calculate_svg_layout();
|
||||
|
||||
// Draw the tree
|
||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
||||
draw.draw_seg_phase1(seg_id, &mut result)?;
|
||||
}
|
||||
|
||||
// Draw snapshots
|
||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
||||
draw.draw_seg_phase2(seg_id, &mut result)?;
|
||||
}
|
||||
|
||||
draw_legend(&mut result)?;
|
||||
|
||||
write!(result, "</svg>")?;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
impl<'a> SvgDraw<'a> {
|
||||
fn calculate_svg_layout(&mut self) {
|
||||
// Find x scale
|
||||
let segments = &self.storage.segments;
|
||||
let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min);
|
||||
let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max);
|
||||
|
||||
// Start with 1 pixel = 1 byte. Double the scale until it fits into the image
|
||||
let mut xscale = 1.0;
|
||||
while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH {
|
||||
xscale *= 2.0;
|
||||
}
|
||||
|
||||
// Layout the timelines on Y dimension.
|
||||
// TODO
|
||||
let mut y = 100.0;
|
||||
let mut branch_y_coordinates = Vec::new();
|
||||
for _branch in self.branches {
|
||||
branch_y_coordinates.push(y);
|
||||
y += 40.0;
|
||||
}
|
||||
|
||||
// Calculate coordinates for each point
|
||||
let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
|
||||
.map(|(seg, branch_id)| {
|
||||
let x = (seg.lsn - min_lsn) as f32 / xscale;
|
||||
let y = branch_y_coordinates[*branch_id];
|
||||
(x, y)
|
||||
})
|
||||
.collect();
|
||||
|
||||
self.xscale = xscale;
|
||||
self.min_lsn = min_lsn;
|
||||
self.seg_coordinates = seg_coordinates;
|
||||
}
|
||||
|
||||
/// Draws lines between points
|
||||
fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
||||
let seg = &self.storage.segments[seg_id];
|
||||
|
||||
let wal_bytes = if let Some(parent_id) = seg.parent {
|
||||
seg.lsn - self.storage.segments[parent_id].lsn
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let style = match self.sizes[seg_id].method {
|
||||
SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
|
||||
SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
|
||||
"stroke-width=\"6\" stroke=\"black\""
|
||||
}
|
||||
SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
|
||||
SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
|
||||
};
|
||||
if let Some(parent_id) = seg.parent {
|
||||
let (x1, y1) = self.seg_coordinates[parent_id];
|
||||
let (x2, y2) = self.seg_coordinates[seg_id];
|
||||
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
" <title>{wal_bytes} bytes of WAL (seg {seg_id})</title>"
|
||||
)?;
|
||||
writeln!(result, "</line>")?;
|
||||
} else {
|
||||
// draw a little dash to mark the starting point of this branch
|
||||
let (x, y) = self.seg_coordinates[seg_id];
|
||||
let (x1, y1) = (x, y - 5.0);
|
||||
let (x2, y2) = (x, y + 5.0);
|
||||
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
||||
)?;
|
||||
writeln!(result, " <title>(seg {seg_id})</title>")?;
|
||||
writeln!(result, "</line>")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Draw circles where snapshots are taken
|
||||
fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
||||
let seg = &self.storage.segments[seg_id];
|
||||
|
||||
// draw a snapshot point if it's needed
|
||||
let (coord_x, coord_y) = self.seg_coordinates[seg_id];
|
||||
if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
|
||||
writeln!(
|
||||
result,
|
||||
"<circle cx=\"{coord_x}\" cy=\"{coord_y}\" r=\"5\" stroke=\"red\">",
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
" <title>logical size {}</title>",
|
||||
seg.size.unwrap()
|
||||
)?;
|
||||
write!(result, "</circle>")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
313
libs/tenant_size_model/tests/tests.rs
Normal file
313
libs/tenant_size_model/tests/tests.rs
Normal file
@@ -0,0 +1,313 @@
|
||||
//! Tenant size model tests.
|
||||
|
||||
use tenant_size_model::{Segment, SizeResult, StorageModel};
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
struct ScenarioBuilder {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of a segment describing its latest state.
|
||||
branches: HashMap<String, usize>,
|
||||
}
|
||||
|
||||
impl ScenarioBuilder {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: &str) -> ScenarioBuilder {
|
||||
let init_segment = Segment {
|
||||
parent: None,
|
||||
lsn: 0,
|
||||
size: Some(0),
|
||||
needed: false, // determined later
|
||||
};
|
||||
|
||||
ScenarioBuilder {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch.into(), 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) {
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let newseg = Segment {
|
||||
parent: Some(lastseg_id),
|
||||
lsn: lastseg.lsn + lsn_bytes,
|
||||
size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
|
||||
needed: false,
|
||||
};
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, bytes as i64);
|
||||
}
|
||||
|
||||
pub fn update(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, 0i64);
|
||||
}
|
||||
|
||||
pub fn _delete(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, -(bytes as i64));
|
||||
}
|
||||
|
||||
/// Panics if the parent branch cannot be found.
|
||||
pub fn branch(&mut self, parent: &str, name: &str) {
|
||||
// Find the right segment
|
||||
let branchseg_id = *self
|
||||
.branches
|
||||
.get(parent)
|
||||
.expect("should had found the parent by key");
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name.to_string(), branchseg_id);
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.lsn <= cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Perform the calculation
|
||||
let storage_model = StorageModel {
|
||||
segments: self.segments.clone(),
|
||||
};
|
||||
let size_result = storage_model.calculate();
|
||||
(storage_model, size_result)
|
||||
}
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
#[test]
|
||||
fn scenario_1() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Calculate the synthetic size with retention horizon 1000
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
// The end of the branch is at LSN 10000. Need to retain
|
||||
// a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
|
||||
// The logical snapshot has size 5000.
|
||||
assert_eq!(result.total_size, 5000 + 1000);
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
#[test]
|
||||
fn scenario_2() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
scenario.update("main", 1_000);
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 11000
|
||||
// *----*----*--------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, we need to retain logical snapshot
|
||||
// at the branch point, size 5000, and the WAL from 10000-11000 on
|
||||
// both branches.
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 1000);
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
#[test]
|
||||
fn scenario_3() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 15000
|
||||
// *----*----*------------------------------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, it's still cheapest to retain
|
||||
// - snapshot at branch point (size 5000)
|
||||
// - WAL on child between 10000-11000
|
||||
// - WAL on main between 10000-15000
|
||||
//
|
||||
// This is in total 5000 + 1000 + 5000
|
||||
//
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 5000);
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
#[test]
|
||||
fn scenario_4() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 18000
|
||||
// *----*----*------------------------------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, it's now cheapest to retain
|
||||
// separate snapshots on both branches:
|
||||
// - snapshot on main branch at LSN 17000 (size 5000)
|
||||
// - WAL on main between 17000-18000
|
||||
// - snapshot on child branch at LSN 10000 (size 5000)
|
||||
// - WAL on child between 10000-11000
|
||||
//
|
||||
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
||||
//
|
||||
// (If we used the the method from the previous scenario, and
|
||||
// kept only snapshot at the branch point, we'd need to keep
|
||||
// all the WAL between 10000-18000 on the main branch, so
|
||||
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
||||
// calculation always picks the cheapest alternative)
|
||||
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenario_5() {
|
||||
let mut scenario = ScenarioBuilder::new("a");
|
||||
scenario.insert("a", 5000);
|
||||
scenario.branch("a", "b");
|
||||
scenario.update("b", 4000);
|
||||
scenario.update("a", 2000);
|
||||
scenario.branch("a", "c");
|
||||
scenario.insert("c", 4000);
|
||||
scenario.insert("a", 2000);
|
||||
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 17000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenario_6() {
|
||||
let branches = [
|
||||
"7ff1edab8182025f15ae33482edb590a",
|
||||
"b1719e044db05401a05a2ed588a3ad3f",
|
||||
"0xb68d6691c895ad0a70809470020929ef",
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut scenario = ScenarioBuilder::new("");
|
||||
|
||||
scenario.branch("", branches[0]); // at 0
|
||||
scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
|
||||
scenario.branch(branches[0], branches[1]); // at 108951064
|
||||
scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
|
||||
scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
|
||||
scenario.branch(branches[0], branches[2]); // at 283415424
|
||||
scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
|
||||
scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400
|
||||
|
||||
let (model, result) = scenario.calculate(100_000);
|
||||
|
||||
// FIXME: We previously calculated 333_792_000. But with this PR, we get
|
||||
// a much lower number. At a quick look at the model output and the
|
||||
// calculations here, the new result seems correct to me.
|
||||
eprintln!(
|
||||
" MODEL: {}",
|
||||
serde_json::to_string(&model.segments).unwrap()
|
||||
);
|
||||
eprintln!(
|
||||
"RESULT: {}",
|
||||
serde_json::to_string(&result.segments).unwrap()
|
||||
);
|
||||
|
||||
assert_eq!(result.total_size, 136_236_928);
|
||||
}
|
||||
@@ -4,13 +4,14 @@ use anyhow::{anyhow, Context};
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
use hyper::{Method, StatusCode};
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::RequestInfo;
|
||||
use routerify::{Middleware, Router, RouterBuilder, RouterService};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::info;
|
||||
use tracing;
|
||||
|
||||
use std::future::Future;
|
||||
use std::net::TcpListener;
|
||||
@@ -27,7 +28,14 @@ static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
|
||||
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
|
||||
// cannot factor out the Level to avoid the repetition
|
||||
// because tracing can only work with const Level
|
||||
// which is not the case here
|
||||
if info.method() == Method::GET && res.status() == StatusCode::OK {
|
||||
tracing::debug!("{} {} {}", info.method(), info.uri().path(), res.status());
|
||||
} else {
|
||||
tracing::info!("{} {} {}", info.method(), info.uri().path(), res.status());
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
@@ -203,7 +211,7 @@ pub fn serve_thread_main<S>(
|
||||
where
|
||||
S: Future<Output = ()> + Send + Sync,
|
||||
{
|
||||
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||
tracing::info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
|
||||
@@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Disable the default rust panic hook by using `set_hook`.
|
||||
///
|
||||
/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
|
||||
/// that sentry is configured (if needed). sentry will install it's own on top of this, always
|
||||
/// processing the panic before we log it.
|
||||
///
|
||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
||||
/// If the assumptions about the initialization order are not held, use
|
||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// lost.
|
||||
#[must_use]
|
||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
||||
std::panic::set_hook(Box::new(tracing_panic_hook));
|
||||
TracingPanicHookGuard::new()
|
||||
}
|
||||
|
||||
/// Drop guard which restores the std panic hook on drop.
|
||||
///
|
||||
/// Tracing should not be used when it's not configured, but we cannot really latch on to any
|
||||
/// imaginary lifetime of tracing.
|
||||
pub struct TracingPanicHookGuard {
|
||||
act: bool,
|
||||
}
|
||||
|
||||
impl TracingPanicHookGuard {
|
||||
fn new() -> Self {
|
||||
TracingPanicHookGuard { act: true }
|
||||
}
|
||||
|
||||
/// Make this hook guard not do anything when dropped.
|
||||
pub fn forget(&mut self) {
|
||||
self.act = false;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TracingPanicHookGuard {
|
||||
fn drop(&mut self) {
|
||||
if self.act {
|
||||
let _ = std::panic::take_hook();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Named symbol for our panic hook, which logs the panic.
|
||||
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
|
||||
// following rust 1.66.1 std implementation:
|
||||
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
||||
let location = info.location();
|
||||
|
||||
let msg = match info.payload().downcast_ref::<&'static str>() {
|
||||
Some(s) => *s,
|
||||
None => match info.payload().downcast_ref::<String>() {
|
||||
Some(s) => &s[..],
|
||||
None => "Box<dyn Any>",
|
||||
},
|
||||
};
|
||||
|
||||
let thread = std::thread::current();
|
||||
let thread = thread.name().unwrap_or("<unnamed>");
|
||||
let backtrace = std::backtrace::Backtrace::capture();
|
||||
|
||||
let _entered = if let Some(location) = location {
|
||||
tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
|
||||
} else {
|
||||
// very unlikely to hit here, but the guarantees of std could change
|
||||
tracing::error_span!("panic", %thread)
|
||||
}
|
||||
.entered();
|
||||
|
||||
if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
|
||||
// this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
|
||||
// get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
|
||||
// string, maybe even to a TLS one but tracing already does that.
|
||||
tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
|
||||
} else {
|
||||
tracing::error!("{msg}");
|
||||
}
|
||||
|
||||
// ensure that we log something on the panic if this hook is left after tracing has been
|
||||
// unconfigured. worst case when teardown is racing the panic is to log the panic twice.
|
||||
tracing::dispatcher::get_default(|d| {
|
||||
if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
|
||||
let location = location.map(PrettyLocation);
|
||||
log_panic_to_stderr(thread, msg, location, &backtrace);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[cold]
|
||||
fn log_panic_to_stderr(
|
||||
thread: &str,
|
||||
msg: &str,
|
||||
location: Option<PrettyLocation<'_, '_>>,
|
||||
backtrace: &std::backtrace::Backtrace,
|
||||
) {
|
||||
eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
|
||||
}
|
||||
|
||||
struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
|
||||
|
||||
impl std::fmt::Display for PrettyLocation<'_, '_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PrettyLocation<'_, '_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
<Self as std::fmt::Display>::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
@@ -51,7 +52,7 @@ thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
|
||||
@@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize logging, which must be initialized before the custom panic hook is installed.
|
||||
logging::init(conf.log_format)?;
|
||||
|
||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
||||
// disarming this hook on pageserver, because we never tear down tracing.
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(
|
||||
Some(GIT_VERSION.into()),
|
||||
@@ -210,9 +217,6 @@ fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<()> {
|
||||
// Initialize logging
|
||||
logging::init(conf.log_format)?;
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
|
||||
@@ -731,6 +731,13 @@ impl PageServerConf {
|
||||
})?);
|
||||
}
|
||||
|
||||
if let Some(eviction_policy) = item.get("eviction_policy") {
|
||||
t_conf.eviction_policy = Some(
|
||||
toml_edit::de::from_item(eviction_policy.clone())
|
||||
.context("parse eviction_policy")?,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize)]
|
||||
#[derive(Serialize, Debug)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
@@ -75,7 +75,7 @@ pub async fn collect_metrics(
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut prev_iteration_time: Option<std::time::Instant> = None;
|
||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -86,11 +86,11 @@ pub async fn collect_metrics(
|
||||
_ = ticker.tick() => {
|
||||
|
||||
// send cached metrics every cached_metric_collection_interval
|
||||
let send_cached = prev_iteration_time
|
||||
.map(|x| x.elapsed() >= cached_metric_collection_interval)
|
||||
.unwrap_or(false);
|
||||
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
|
||||
|
||||
prev_iteration_time = Some(std::time::Instant::now());
|
||||
if send_cached {
|
||||
prev_iteration_time = std::time::Instant::now();
|
||||
}
|
||||
|
||||
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
|
||||
}
|
||||
@@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration(
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk_to_send.iter() {
|
||||
// Report if the metric value is suspiciously large
|
||||
if metric.value > (1u64 << 40) {
|
||||
error!("potentially abnormal metric value: {:?}", metric);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
|
||||
@@ -437,6 +437,13 @@ paths:
|
||||
type: boolean
|
||||
description: |
|
||||
When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
|
||||
- name: retention_period
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
description: |
|
||||
Override the default retention period (in bytes) used for size calculation.
|
||||
get:
|
||||
description: |
|
||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||
|
||||
@@ -7,19 +7,21 @@ use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::mgr::TenantMapInsertError;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
@@ -38,7 +40,7 @@ use utils::{
|
||||
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||
use super::models::ConfigureFailpointsRequest;
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -328,7 +330,7 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Future(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
@@ -479,11 +481,19 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
|
||||
/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
|
||||
/// values.
|
||||
///
|
||||
/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size
|
||||
/// (only if it is shorter than the real cutoff).
|
||||
///
|
||||
/// Note: we don't update the cached size and prometheus metric here.
|
||||
/// The retention period might be different, and it's nice to have a method to just calculate it
|
||||
/// without modifying anything anyway.
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||
let headers = request.headers();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
@@ -492,24 +502,29 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs(&ctx)
|
||||
.gather_size_inputs(retention_period, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let size = if !inputs_only.unwrap_or(false) {
|
||||
Some(
|
||||
tenant
|
||||
.calc_and_update_cached_synthetic_size(&inputs)
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut sizes = None;
|
||||
if !inputs_only.unwrap_or(false) {
|
||||
let storage_model = inputs
|
||||
.calculate_model()
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let size = storage_model.calculate();
|
||||
|
||||
/// Private response type with the additional "unstable" `inputs` field.
|
||||
///
|
||||
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
|
||||
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
|
||||
// If request header expects html, return html
|
||||
if headers["Accept"] == "text/html" {
|
||||
return synthetic_size_html_response(inputs, storage_model, size);
|
||||
}
|
||||
sizes = Some(size);
|
||||
} else if headers["Accept"] == "text/html" {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"inputs_only parameter is incompatible with html output request"
|
||||
)));
|
||||
}
|
||||
|
||||
/// The type resides in the pageserver not to expose `ModelInputs`.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct TenantHistorySize {
|
||||
@@ -519,6 +534,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
///
|
||||
/// Will be none if `?inputs_only=true` was given.
|
||||
size: Option<u64>,
|
||||
/// Size of each segment used in the model.
|
||||
/// Will be null if `?inputs_only=true` was given.
|
||||
segment_sizes: Option<Vec<tenant_size_model::SegmentSizeResult>>,
|
||||
inputs: crate::tenant::size::ModelInputs,
|
||||
}
|
||||
|
||||
@@ -526,7 +544,8 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
StatusCode::OK,
|
||||
TenantHistorySize {
|
||||
id: tenant_id,
|
||||
size,
|
||||
size: sizes.as_ref().map(|x| x.total_size),
|
||||
segment_sizes: sizes.map(|x| x.segments),
|
||||
inputs,
|
||||
},
|
||||
)
|
||||
@@ -591,6 +610,62 @@ async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response
|
||||
}
|
||||
}
|
||||
|
||||
/// Get tenant_size SVG graph along with the JSON data.
|
||||
fn synthetic_size_html_response(
|
||||
inputs: ModelInputs,
|
||||
storage_model: StorageModel,
|
||||
sizes: SizeResult,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let mut timeline_ids: Vec<String> = Vec::new();
|
||||
let mut timeline_map: HashMap<TimelineId, usize> = HashMap::new();
|
||||
for (index, ti) in inputs.timeline_inputs.iter().enumerate() {
|
||||
timeline_map.insert(ti.timeline_id, index);
|
||||
timeline_ids.push(ti.timeline_id.to_string());
|
||||
}
|
||||
let seg_to_branch: Vec<usize> = inputs
|
||||
.segments
|
||||
.iter()
|
||||
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
|
||||
.collect();
|
||||
|
||||
let svg =
|
||||
tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut response = String::new();
|
||||
|
||||
use std::fmt::Write;
|
||||
write!(response, "<html>\n<body>\n").unwrap();
|
||||
write!(response, "<div>\n{svg}\n</div>").unwrap();
|
||||
writeln!(response, "Project size: {}", sizes.total_size).unwrap();
|
||||
writeln!(response, "<pre>").unwrap();
|
||||
writeln!(
|
||||
response,
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&inputs).unwrap()
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(
|
||||
response,
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&sizes.segments).unwrap()
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(response, "</pre>").unwrap();
|
||||
write!(response, "</body>\n</html>\n").unwrap();
|
||||
|
||||
html_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
|
||||
let response = Response::builder()
|
||||
.status(status)
|
||||
.header(hyper::header::CONTENT_TYPE, "text/html")
|
||||
.body(Body::from(data.as_bytes().to_vec()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
// Helper function to standardize the error messages we produce on bad durations
|
||||
//
|
||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||
@@ -797,6 +872,14 @@ async fn update_tenant_config_handler(
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(eviction_policy) = request_data.eviction_policy {
|
||||
tenant_conf.eviction_policy = Some(
|
||||
serde_json::from_value(eviction_policy)
|
||||
.context("parse field `eviction_policy`")
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
@@ -842,7 +925,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -946,6 +1028,17 @@ async fn active_timeline_of_active_tenant(
|
||||
.map_err(ApiError::NotFound)
|
||||
}
|
||||
|
||||
async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
|
||||
// For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
|
||||
// Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
|
||||
let query = req.uri().query();
|
||||
let _ = std::panic::catch_unwind(|| {
|
||||
panic!("unconditional panic for testing panic hook integration; request query: {query:?}")
|
||||
});
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1011,7 +1104,7 @@ pub fn make_router(
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler)
|
||||
.put("/v1/tenant/config", update_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
@@ -1030,7 +1123,7 @@ pub fn make_router(
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
|
||||
testing_api!("run timeline GC", timeline_gc_handler),
|
||||
timeline_gc_handler,
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
|
||||
@@ -1064,5 +1157,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
evict_timeline_layer_handler,
|
||||
)
|
||||
.get("/v1/panic", always_panic_handler)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -290,6 +290,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Locate LSN, such that all transactions that committed before
|
||||
/// 'search_timestamp' are visible, but nothing newer is.
|
||||
///
|
||||
@@ -303,7 +304,11 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
// We use this method to figure out the branching LSN for new branch, but
|
||||
// GC cutoff could be before branching point and we cannot create new branch
|
||||
// with LSN < `ancestor_lsn`. Thus, pick the maximum of these two just to be
|
||||
// on the safe side.
|
||||
let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
|
||||
let max_lsn = self.get_last_record_lsn();
|
||||
|
||||
// LSNs are always 8-byte aligned. low/mid/high represent the
|
||||
@@ -327,12 +332,22 @@ impl Timeline {
|
||||
)
|
||||
.await?;
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
} else {
|
||||
if !cmp {
|
||||
// We either 1) found commit with timestamp **before** `search_timestamp`;
|
||||
// or 2) we haven't found any commit records at all.
|
||||
// Search with a larger LSN, in case 1) to try to find a more recent commit
|
||||
// (but still **before** target timestamp); and in case 2) to fetch more
|
||||
// SLRU segments for `clog`.
|
||||
low = mid + 1;
|
||||
} else {
|
||||
// We found only more recent commits, search in the older range.
|
||||
high = mid;
|
||||
}
|
||||
}
|
||||
// If `found_smaller == true`, `low` is the LSN of the first commit record
|
||||
// **before** the `search_timestamp` + 1 (to hit the while loop exit condition).
|
||||
// Substitute 1 to get exactly the commit LSN.
|
||||
let commit_lsn = Lsn((low - 1) * 8);
|
||||
match (found_smaller, found_larger) {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
@@ -340,32 +355,26 @@ impl Timeline {
|
||||
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
Ok(LsnForTimestamp::Future(max_lsn))
|
||||
// Only found a commit with timestamp smaller than the request.
|
||||
// It's still a valid case for branch creation, return it.
|
||||
// And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
|
||||
// case, anyway.
|
||||
Ok(LsnForTimestamp::Future(commit_lsn))
|
||||
}
|
||||
(false, true) => {
|
||||
// Didn't find any commit timestamps smaller than the request
|
||||
Ok(LsnForTimestamp::Past(max_lsn))
|
||||
}
|
||||
(true, true) => {
|
||||
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||
// Back off by one to get to the point just before the commit.
|
||||
//
|
||||
// FIXME: it would be better to get the LSN of the previous commit.
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
}
|
||||
(true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
|
||||
/// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
|
||||
/// Subroutine of `find_lsn_for_timestamp()`. Returns `true`, if there are any
|
||||
/// commits that committed after `search_timestamp`, at LSN `probe_lsn`.
|
||||
///
|
||||
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
/// Additionally, sets `found_smaller` / `found_larger`, if encounters any commits
|
||||
/// with a smaller / larger timestamp.
|
||||
///
|
||||
pub async fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
|
||||
@@ -7,11 +7,11 @@ use std::fmt;
|
||||
use std::ops::{AddAssign, Range};
|
||||
use std::time::Duration;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
/// Key used in the Repository kv-store.
|
||||
///
|
||||
/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
|
||||
/// for what we actually store in these fields.
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
pub struct Key {
|
||||
pub field1: u8,
|
||||
pub field2: u32,
|
||||
|
||||
@@ -231,6 +231,9 @@ pub enum TaskKind {
|
||||
// Compaction. One per tenant.
|
||||
Compaction,
|
||||
|
||||
// Eviction. One per timeline.
|
||||
Eviction,
|
||||
|
||||
// Initial logical size calculation
|
||||
InitialLogicalSizeCalculation,
|
||||
|
||||
|
||||
@@ -2418,6 +2418,9 @@ impl Tenant {
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn gather_size_inputs(
|
||||
&self,
|
||||
// `max_retention_period` overrides the cutoff that is used to calculate the size
|
||||
// (only if it is shorter than the real cutoff).
|
||||
max_retention_period: Option<u64>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<size::ModelInputs> {
|
||||
let logical_sizes_at_once = self
|
||||
@@ -2425,32 +2428,41 @@ impl Tenant {
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.inner();
|
||||
|
||||
// TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
|
||||
// are for testing/experimenting, we tolerate this.
|
||||
// TODO: Having a single mutex block concurrent reads is not great for performance.
|
||||
//
|
||||
// But the only case where we need to run multiple of these at once is when we
|
||||
// request a size for a tenant manually via API, while another background calculation
|
||||
// is in progress (which is not a common case).
|
||||
//
|
||||
// See more for on the issue #2748 condenced out of the initial PR review.
|
||||
let mut shared_cache = self.cached_logical_sizes.lock().await;
|
||||
|
||||
size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
|
||||
size::gather_inputs(
|
||||
self,
|
||||
logical_sizes_at_once,
|
||||
max_retention_period,
|
||||
&mut shared_cache,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Calculate synthetic tenant size
|
||||
/// Calculate synthetic tenant size and cache the result.
|
||||
/// This is periodically called by background worker.
|
||||
/// result is cached in tenant struct
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
|
||||
let inputs = self.gather_size_inputs(ctx).await?;
|
||||
let inputs = self.gather_size_inputs(None, ctx).await?;
|
||||
|
||||
self.calc_and_update_cached_synthetic_size(&inputs)
|
||||
}
|
||||
|
||||
/// Calculate synthetic size , cache it and set metric value
|
||||
pub fn calc_and_update_cached_synthetic_size(
|
||||
&self,
|
||||
inputs: &size::ModelInputs,
|
||||
) -> anyhow::Result<u64> {
|
||||
let size = inputs.calculate()?;
|
||||
|
||||
self.set_cached_synthetic_size(size);
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
/// Cache given synthetic size and update the metric value
|
||||
pub fn set_cached_synthetic_size(&self, size: u64) {
|
||||
self.cached_synthetic_tenant_size
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
@@ -2458,8 +2470,6 @@ impl Tenant {
|
||||
.get_metric_with_label_values(&[&self.tenant_id.to_string()])
|
||||
.unwrap()
|
||||
.set(size);
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
pub fn get_cached_synthetic_size(&self) -> u64 {
|
||||
@@ -2757,6 +2767,7 @@ pub mod harness {
|
||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
||||
eviction_policy: Some(tenant_conf.eviction_policy),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,6 +91,7 @@ pub struct TenantConf {
|
||||
/// to avoid eager reconnects.
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub trace_read_requests: bool,
|
||||
pub eviction_policy: EvictionPolicy,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -153,6 +154,34 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub trace_read_requests: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum EvictionPolicy {
|
||||
NoEviction,
|
||||
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
|
||||
}
|
||||
|
||||
impl EvictionPolicy {
|
||||
pub fn discriminant_str(&self) -> &'static str {
|
||||
match self {
|
||||
EvictionPolicy::NoEviction => "NoEviction",
|
||||
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub period: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub threshold: Duration,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -189,6 +218,7 @@ impl TenantConfOpt {
|
||||
trace_read_requests: self
|
||||
.trace_read_requests
|
||||
.unwrap_or(global_conf.trace_read_requests),
|
||||
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,6 +291,7 @@ impl Default for TenantConf {
|
||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||
trace_read_requests: false,
|
||||
eviction_policy: EvictionPolicy::NoEviction,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -154,6 +154,12 @@ where
|
||||
expected: &Arc<L>,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(
|
||||
// this is not what happens if an L0 layer was not found a anyhow error but perhaps
|
||||
// that should be changed. this is good enough to show a replacement failure.
|
||||
Replacement::NotFound
|
||||
));
|
||||
|
||||
self.layer_map.replace_historic_noflush(expected, new)
|
||||
}
|
||||
|
||||
@@ -731,16 +737,30 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
|
||||
///
|
||||
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
|
||||
#[inline(always)]
|
||||
fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn' references because of multiple vtables
|
||||
// can be created in compilation. Clippy complains about this. In practice it seems to
|
||||
// work.
|
||||
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
|
||||
// "dyn Trait" objects are "fat pointers" in that they have two components:
|
||||
// - pointer to the object
|
||||
// - pointer to the vtable
|
||||
//
|
||||
// In future rust versions this might become Arc::as_ptr(left) as *const () ==
|
||||
// Arc::as_ptr(right) as *const (), we could change to that before.
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
Arc::ptr_eq(left, right)
|
||||
// rust does not provide a guarantee that these vtables are unique, but however
|
||||
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
|
||||
// pointer and the vtable need to be equal.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/issues/103763
|
||||
//
|
||||
// A future version of rust will most likely use this form below, where we cast each
|
||||
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
|
||||
// not affect the comparison.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/pull/106450
|
||||
let left = Arc::as_ptr(left) as *const ();
|
||||
let right = Arc::as_ptr(right) as *const ();
|
||||
|
||||
left == right
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -540,13 +540,11 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub async fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
@@ -571,14 +571,15 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch a delete operation in the background.
|
||||
///
|
||||
/// The operation does not modify local state but assumes the local files have already been
|
||||
/// deleted, and is used to mirror those changes to remote.
|
||||
///
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// succesfully.
|
||||
///
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -13,6 +13,7 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use either::Either;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
@@ -92,7 +93,23 @@ pub enum ValueReconstructResult {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct LayerAccessStats(Mutex<LayerAccessStatsInner>);
|
||||
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
|
||||
|
||||
/// This struct holds two instances of [`LayerAccessStatsInner`].
|
||||
/// Accesses are recorded to both instances.
|
||||
/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
|
||||
/// The `for_eviction_policy` is never reset.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct LayerAccessStatsLocked {
|
||||
for_scraping_api: LayerAccessStatsInner,
|
||||
for_eviction_policy: LayerAccessStatsInner,
|
||||
}
|
||||
|
||||
impl LayerAccessStatsLocked {
|
||||
fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
|
||||
[&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct LayerAccessStatsInner {
|
||||
@@ -103,11 +120,11 @@ struct LayerAccessStatsInner {
|
||||
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct LayerAccessStatFullDetails {
|
||||
when: SystemTime,
|
||||
task_kind: TaskKind,
|
||||
access_kind: LayerAccessKind,
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(super) struct LayerAccessStatFullDetails {
|
||||
pub(super) when: SystemTime,
|
||||
pub(super) task_kind: TaskKind,
|
||||
pub(super) access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, strum_macros::EnumString)]
|
||||
@@ -126,7 +143,7 @@ fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
|
||||
}
|
||||
|
||||
impl LayerAccessStatFullDetails {
|
||||
fn to_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
||||
fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
||||
let Self {
|
||||
when,
|
||||
task_kind,
|
||||
@@ -142,13 +159,13 @@ impl LayerAccessStatFullDetails {
|
||||
|
||||
impl LayerAccessStats {
|
||||
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
||||
new
|
||||
}
|
||||
|
||||
pub(crate) fn for_new_layer_file() -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
@@ -176,38 +193,43 @@ impl LayerAccessStats {
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner
|
||||
.last_residence_changes
|
||||
.write(LayerResidenceEvent::new(status, reason));
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner
|
||||
.last_residence_changes
|
||||
.write(LayerResidenceEvent::new(status, reason))
|
||||
});
|
||||
}
|
||||
|
||||
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
let this_access = LayerAccessStatFullDetails {
|
||||
when: SystemTime::now(),
|
||||
task_kind,
|
||||
access_kind,
|
||||
};
|
||||
inner
|
||||
.first_access
|
||||
.get_or_insert_with(|| this_access.clone());
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= task_kind;
|
||||
inner.last_accesses.write(this_access);
|
||||
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner.first_access.get_or_insert(this_access);
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= task_kind;
|
||||
inner.last_accesses.write(this_access);
|
||||
})
|
||||
}
|
||||
fn to_api_model(
|
||||
|
||||
fn as_api_model(
|
||||
&self,
|
||||
reset: LayerAccessStatsReset,
|
||||
) -> pageserver_api::models::LayerAccessStats {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
let inner = &mut locked.for_scraping_api;
|
||||
let LayerAccessStatsInner {
|
||||
first_access,
|
||||
count_by_access_kind,
|
||||
task_kind_flag,
|
||||
last_accesses,
|
||||
last_residence_changes,
|
||||
} = &*inner;
|
||||
} = inner;
|
||||
let ret = pageserver_api::models::LayerAccessStats {
|
||||
access_count_by_access_kind: count_by_access_kind
|
||||
.iter()
|
||||
@@ -217,8 +239,8 @@ impl LayerAccessStats {
|
||||
.iter()
|
||||
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
||||
.collect(),
|
||||
first: first_access.as_ref().map(|a| a.to_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.to_api_model()),
|
||||
first: first_access.as_ref().map(|a| a.as_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
||||
residence_events_history: last_residence_changes.clone(),
|
||||
};
|
||||
match reset {
|
||||
@@ -232,6 +254,20 @@ impl LayerAccessStats {
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
pub(super) fn most_recent_access_or_residence_event(
|
||||
&self,
|
||||
) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
match inner.last_accesses.recent() {
|
||||
Some(a) => Either::Left(*a),
|
||||
None => match inner.last_residence_changes.recent() {
|
||||
Some(e) => Either::Right(e.clone()),
|
||||
None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change event"),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
@@ -449,3 +485,14 @@ enum PathOrConf {
|
||||
Path(PathBuf),
|
||||
Conf(&'static PageServerConf),
|
||||
}
|
||||
|
||||
/// Range wrapping newtype, which uses display to render Debug.
|
||||
///
|
||||
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
|
||||
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
|
||||
|
||||
impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}..{}", self.0.start, self.0.end)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,8 +194,10 @@ pub struct DeltaLayer {
|
||||
|
||||
impl std::fmt::Debug for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("DeltaLayer")
|
||||
.field("key_range", &self.key_range)
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("lsn_range", &self.lsn_range)
|
||||
.field("file_size", &self.file_size)
|
||||
.field("inner", &self.inner)
|
||||
@@ -450,7 +452,7 @@ impl PersistentLayer for DeltaLayer {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
|
||||
let access_stats = self.access_stats.to_api_model(reset);
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
|
||||
@@ -10,12 +10,23 @@ use std::str::FromStr;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// Note: Timeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
#[derive(PartialEq, Eq, Clone, Hash)]
|
||||
pub struct DeltaFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("DeltaFileName")
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("lsn_range", &self.lsn_range)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for DeltaFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
@@ -100,12 +111,23 @@ impl fmt::Display for DeltaFileName {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
#[derive(PartialEq, Eq, Clone, Hash)]
|
||||
pub struct ImageFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("ImageFileName")
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("lsn", &self.lsn)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ImageFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
|
||||
@@ -119,8 +119,10 @@ pub struct ImageLayer {
|
||||
|
||||
impl std::fmt::Debug for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("ImageLayer")
|
||||
.field("key_range", &self.key_range)
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("file_size", &self.file_size)
|
||||
.field("lsn", &self.lsn)
|
||||
.field("inner", &self.inner)
|
||||
@@ -269,7 +271,7 @@ impl PersistentLayer for ImageLayer {
|
||||
layer_file_size: Some(self.file_size),
|
||||
lsn_start: lsn_range.start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.to_api_model(reset),
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,17 @@ pub struct RemoteLayer {
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// Has `LayerMap::replace` failed for this (true) or not (false).
|
||||
///
|
||||
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
|
||||
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
|
||||
/// unprocessable, because a LayerMap::replace failed.
|
||||
///
|
||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||
/// `Timeline::download_remote_layer`, which also logs.
|
||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for RemoteLayer {
|
||||
@@ -171,7 +182,7 @@ impl PersistentLayer for RemoteLayer {
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: true,
|
||||
access_stats: self.access_stats.to_api_model(reset),
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
} else {
|
||||
HistoricLayerInfo::Image {
|
||||
@@ -179,7 +190,7 @@ impl PersistentLayer for RemoteLayer {
|
||||
layer_file_size: self.layer_metadata.file_size(),
|
||||
lsn_start: lsn_range.start,
|
||||
remote: true,
|
||||
access_stats: self.access_stats.to_api_model(reset),
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -207,6 +218,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
@@ -228,6 +240,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
@@ -11,6 +11,7 @@ use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::id::TenantId;
|
||||
|
||||
@@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(tenant) => tenant,
|
||||
},
|
||||
};
|
||||
};
|
||||
|
||||
let mut sleep_duration = tenant.get_compaction_period();
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
let period = tenant.get_compaction_period();
|
||||
|
||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
||||
// loop, #3501. There are also additional "allowed_errors" in tests.
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
let sleep_duration = if period == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
@@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
// cutoff specified as time.
|
||||
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
@@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
},
|
||||
};
|
||||
|
||||
let gc_period = tenant.get_gc_period();
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let mut sleep_duration = gc_period;
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run gc
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
let period = tenant.get_gc_period();
|
||||
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run gc
|
||||
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
|
||||
if let Err(e) = res {
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "gc");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
@@ -197,3 +229,49 @@ async fn wait_for_active_tenant(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("cancelled")]
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
/// Provide a random delay for background task initialization.
|
||||
///
|
||||
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
|
||||
/// different periods for more stable load.
|
||||
pub(crate) async fn random_init_delay(
|
||||
period: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), Cancelled> {
|
||||
use rand::Rng;
|
||||
|
||||
let d = {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// gen_range asserts that the range cannot be empty, which it could be because period can
|
||||
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
|
||||
let period = std::cmp::max(period, Duration::from_secs(10));
|
||||
|
||||
// semi-ok default as the source of jitter
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
_ = tokio::time::sleep(d) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
|
||||
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
|
||||
if elapsed >= period && period != Duration::ZERO {
|
||||
// humantime does no significant digits clamping whereas Duration's debug is a bit more
|
||||
// intelligent. however it makes sense to keep the "configuration format" for period, even
|
||||
// though there's no way to output the actual config value.
|
||||
warn!(
|
||||
?elapsed,
|
||||
period = %humantime::format_duration(period),
|
||||
task,
|
||||
"task iteration took longer than the configured period"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//!
|
||||
|
||||
mod eviction_task;
|
||||
mod walreceiver;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
@@ -18,6 +19,7 @@ use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
@@ -47,7 +49,7 @@ use crate::metrics::TimelineMetrics;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
@@ -81,6 +83,25 @@ enum FlushLoopState {
|
||||
Exited,
|
||||
}
|
||||
|
||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Hole {
|
||||
key_range: Range<Key>,
|
||||
coverage_size: usize,
|
||||
}
|
||||
|
||||
impl Ord for Hole {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.coverage_size.cmp(&self.coverage_size) // inverse order
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Hole {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
@@ -291,18 +312,9 @@ impl LogicalSize {
|
||||
// we change the type.
|
||||
match self.initial_logical_size.get() {
|
||||
Some(initial_size) => {
|
||||
let absolute_size_increment = u64::try_from(
|
||||
size_increment
|
||||
.checked_abs()
|
||||
.with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?,
|
||||
).expect("casting nonnegative i64 to u64 should not fail");
|
||||
|
||||
if size_increment < 0 {
|
||||
initial_size.checked_sub(absolute_size_increment)
|
||||
} else {
|
||||
initial_size.checked_add(absolute_size_increment)
|
||||
}.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
||||
.map(CurrentLogicalSize::Exact)
|
||||
initial_size.checked_add_signed(size_increment)
|
||||
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
||||
.map(CurrentLogicalSize::Exact)
|
||||
}
|
||||
None => {
|
||||
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
|
||||
@@ -621,7 +633,10 @@ impl Timeline {
|
||||
self.flush_frozen_layers_and_wait().await
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
const ROUNDS: usize = 2;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
|
||||
// Last record Lsn could be zero in case the timeline was just created
|
||||
@@ -630,6 +645,86 @@ impl Timeline {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// retry two times to allow first round to find layers which need to be downloaded, then
|
||||
// download them, then retry compaction
|
||||
for round in 0..ROUNDS {
|
||||
// should we error out with the most specific error?
|
||||
let last_round = round == ROUNDS - 1;
|
||||
|
||||
let res = self.compact_inner(ctx).await;
|
||||
|
||||
// If `create_image_layers' or `compact_level0` scheduled any
|
||||
// uploads or deletions, but didn't update the index file yet,
|
||||
// do it now.
|
||||
//
|
||||
// This isn't necessary for correctness, the remote state is
|
||||
// consistent without the uploads and deletions, and we would
|
||||
// update the index file on next flush iteration too. But it
|
||||
// could take a while until that happens.
|
||||
//
|
||||
// Additionally, only do this on the terminal round before sleeping.
|
||||
if last_round {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
}
|
||||
|
||||
let rls = match res {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(CompactionError::DownloadRequired(rls)) if !last_round => {
|
||||
// this can be done at most one time before exiting, waiting
|
||||
rls
|
||||
}
|
||||
Err(CompactionError::DownloadRequired(rls)) => {
|
||||
anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
|
||||
}
|
||||
Err(CompactionError::Other(e)) => {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
// this path can be visited in the second round of retrying, if first one found that we
|
||||
// must first download some remote layers
|
||||
let total = rls.len();
|
||||
|
||||
let mut downloads = rls
|
||||
.into_iter()
|
||||
.map(|rl| self.download_remote_layer(rl))
|
||||
.collect::<futures::stream::FuturesUnordered<_>>();
|
||||
|
||||
let mut failed = 0;
|
||||
|
||||
let cancelled = task_mgr::shutdown_watcher();
|
||||
tokio::pin!(cancelled);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
|
||||
res = downloads.next() => {
|
||||
match res {
|
||||
Some(Ok(())) => {},
|
||||
Some(Err(e)) => {
|
||||
warn!("Downloading remote layer for compaction failed: {e:#}");
|
||||
failed += 1;
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if failed != 0 {
|
||||
anyhow::bail!("{failed} out of {total} layers failed to download, retrying later");
|
||||
}
|
||||
|
||||
// if everything downloaded fine, lets try again
|
||||
}
|
||||
|
||||
unreachable!("retry loop exits")
|
||||
}
|
||||
|
||||
/// Compaction which might need to be retried after downloading remote layers.
|
||||
async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
|
||||
//
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
@@ -668,7 +763,7 @@ impl Timeline {
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Stopping {
|
||||
anyhow::bail!("timeline is Stopping");
|
||||
return Err(anyhow::anyhow!("timeline is Stopping").into());
|
||||
}
|
||||
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
@@ -688,7 +783,8 @@ impl Timeline {
|
||||
// "enough".
|
||||
let layer_paths_to_upload = self
|
||||
.create_image_layers(&partitioning, lsn, false, ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
for (path, layer_metadata) in layer_paths_to_upload {
|
||||
remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
|
||||
@@ -700,18 +796,6 @@ impl Timeline {
|
||||
self.compact_level0(&layer_removal_cs, target_file_size, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
// If `create_image_layers' or `compact_level0` scheduled any
|
||||
// uploads or deletions, but didn't update the index file yet,
|
||||
// do it now.
|
||||
//
|
||||
// This isn't necessary for correctness, the remote state is
|
||||
// consistent without the uploads and deletions, and we would
|
||||
// update the index file on next flush iteration too. But it
|
||||
// could take a while until that happens.
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -801,6 +885,7 @@ impl Timeline {
|
||||
pub fn activate(self: &Arc<Self>) {
|
||||
self.set_state(TimelineState::Active);
|
||||
self.launch_wal_receiver();
|
||||
self.launch_eviction_task();
|
||||
}
|
||||
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
@@ -867,19 +952,101 @@ impl Timeline {
|
||||
Ok(Some(true))
|
||||
}
|
||||
|
||||
/// Like [`evict_layer_batch`], but for just one layer.
|
||||
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
||||
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||
let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) };
|
||||
if local_layer.is_remote_layer() {
|
||||
return Ok(Some(false));
|
||||
}
|
||||
let Some(remote_client) = &self.remote_client else { return Ok(Some(false)) };
|
||||
let remote_client = self
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
|
||||
|
||||
// ensure the current layer is uploaded for sure
|
||||
let cancel = CancellationToken::new();
|
||||
let results = self
|
||||
.evict_layer_batch(remote_client, &[local_layer], cancel)
|
||||
.await?;
|
||||
assert_eq!(results.len(), 1);
|
||||
let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
|
||||
match result {
|
||||
None => anyhow::bail!("task_mgr shutdown requested"),
|
||||
Some(Ok(b)) => Ok(Some(b)),
|
||||
Some(Err(e)) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Evict multiple layers at once, continuing through errors.
|
||||
///
|
||||
/// Try to evict the given `layers_to_evict` by
|
||||
///
|
||||
/// 1. Replacing the given layer object in the layer map with a corresponding [`RemoteLayer`] object.
|
||||
/// 2. Deleting the now unreferenced layer file from disk.
|
||||
///
|
||||
/// The `remote_client` should be this timeline's `self.remote_client`.
|
||||
/// We make the caller provide it so that they are responsible for handling the case
|
||||
/// where someone wants to evict the layer but no remote storage is configured.
|
||||
///
|
||||
/// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
|
||||
/// If `Err()` is returned, no eviction was attempted.
|
||||
/// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
|
||||
/// Meaning of each `result[i]`:
|
||||
/// - `Some(Err(...))` if layer replacement failed for an unexpected reason
|
||||
/// - `Some(Ok(true))` if everything went well.
|
||||
/// - `Some(Ok(false))` if there was an expected reason why the layer could not be replaced, e.g.:
|
||||
/// - evictee was not yet downloaded
|
||||
/// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
|
||||
/// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
|
||||
async fn evict_layer_batch(
|
||||
&self,
|
||||
remote_client: &Arc<RemoteTimelineClient>,
|
||||
layers_to_evict: &[Arc<dyn PersistentLayer>],
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
|
||||
// ensure that the layers have finished uploading
|
||||
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
||||
remote_client
|
||||
.wait_completion()
|
||||
.await
|
||||
.context("wait for layer upload ops to complete")?;
|
||||
|
||||
// now lock out layer removal (compaction, gc, timeline deletion)
|
||||
let layer_removal_guard = self.layer_removal_cs.lock().await;
|
||||
|
||||
// start the batch update
|
||||
let mut layer_map = self.layers.write().unwrap();
|
||||
let mut batch_updates = layer_map.batch_update();
|
||||
|
||||
let mut results = Vec::with_capacity(layers_to_evict.len());
|
||||
|
||||
for l in layers_to_evict.iter() {
|
||||
let res = if cancel.is_cancelled() {
|
||||
None
|
||||
} else {
|
||||
Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates))
|
||||
};
|
||||
results.push(res);
|
||||
}
|
||||
|
||||
// commit the updates & release locks
|
||||
batch_updates.flush();
|
||||
drop(layer_map);
|
||||
drop(layer_removal_guard);
|
||||
|
||||
assert_eq!(results.len(), layers_to_evict.len());
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
fn evict_layer_batch_impl(
|
||||
&self,
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
local_layer: &Arc<dyn PersistentLayer>,
|
||||
batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
|
||||
) -> anyhow::Result<bool> {
|
||||
use super::layer_map::Replacement;
|
||||
|
||||
if local_layer.is_remote_layer() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let layer_metadata = LayerFileMetadata::new(
|
||||
local_layer
|
||||
.file_size()
|
||||
@@ -906,16 +1073,38 @@ impl Timeline {
|
||||
),
|
||||
});
|
||||
|
||||
let gc_lock = self.layer_removal_cs.lock().await;
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
self.delete_historic_layer(&gc_lock, local_layer, &mut updates)?;
|
||||
updates.insert_historic(new_remote_layer);
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
drop(gc_lock);
|
||||
let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
|
||||
Replacement::Replaced { .. } => {
|
||||
let layer_size = local_layer.file_size();
|
||||
|
||||
Ok(Some(true))
|
||||
if let Err(e) = local_layer.delete() {
|
||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
||||
}
|
||||
|
||||
if let Some(layer_size) = layer_size {
|
||||
self.metrics.resident_physical_size_gauge.sub(layer_size);
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
Replacement::NotFound => {
|
||||
debug!(evicted=?local_layer, "layer was no longer in layer map");
|
||||
false
|
||||
}
|
||||
Replacement::RemovalBuffered => {
|
||||
unreachable!("not doing anything else in this batch")
|
||||
}
|
||||
Replacement::Unexpected(other) => {
|
||||
error!(
|
||||
local_layer.ptr=?Arc::as_ptr(local_layer),
|
||||
other.ptr=?Arc::as_ptr(&other),
|
||||
?other,
|
||||
"failed to replace");
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
Ok(replaced)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -956,6 +1145,13 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.eviction_policy
|
||||
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
|
||||
}
|
||||
|
||||
/// Open a Timeline handle.
|
||||
///
|
||||
/// Loads the metadata for the timeline into memory, but not the layer map.
|
||||
@@ -1512,13 +1708,31 @@ impl Timeline {
|
||||
}
|
||||
x @ Err(_) => x.context("Failed to calculate logical size")?,
|
||||
};
|
||||
|
||||
// we cannot query current_logical_size.current_size() to know the current
|
||||
// *negative* value, only truncated to u64.
|
||||
let added = self_clone
|
||||
.current_logical_size
|
||||
.size_added_after_initial
|
||||
.load(AtomicOrdering::Relaxed);
|
||||
|
||||
let sum = calculated_size.saturating_add_signed(added);
|
||||
|
||||
// set the gauge value before it can be set in `update_current_logical_size`.
|
||||
self_clone.metrics.current_logical_size_gauge.set(sum);
|
||||
|
||||
match self_clone
|
||||
.current_logical_size
|
||||
.initial_logical_size
|
||||
.set(calculated_size)
|
||||
{
|
||||
Ok(()) => (),
|
||||
Err(existing_size) => {
|
||||
Err(_what_we_just_attempted_to_set) => {
|
||||
let existing_size = self_clone
|
||||
.current_logical_size
|
||||
.initial_logical_size
|
||||
.get()
|
||||
.expect("once_cell set was lost, then get failed, impossible.");
|
||||
// This shouldn't happen because the semaphore is initialized with 1.
|
||||
// But if it happens, just complain & report success so there are no further retries.
|
||||
error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
|
||||
@@ -1576,15 +1790,9 @@ impl Timeline {
|
||||
let calculation = async {
|
||||
let cancel = cancel.child_token();
|
||||
let ctx = ctx.attached_child();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
// Run in a separate thread since this can do a lot of
|
||||
// synchronous file IO without .await inbetween
|
||||
// if there are no RemoteLayers that would require downloading.
|
||||
let h = tokio::runtime::Handle::current();
|
||||
h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx))
|
||||
})
|
||||
.await
|
||||
.context("Failed to spawn calculation result task")?
|
||||
self_calculation
|
||||
.calculate_logical_size(init_lsn, cancel, &ctx)
|
||||
.await
|
||||
};
|
||||
let timeline_state_cancellation = async {
|
||||
loop {
|
||||
@@ -1617,7 +1825,7 @@ impl Timeline {
|
||||
tokio::pin!(calculation);
|
||||
loop {
|
||||
tokio::select! {
|
||||
res = &mut calculation => { return res }
|
||||
res = &mut calculation => { return res }
|
||||
reason = timeline_state_cancellation => {
|
||||
debug!(reason = reason, "cancelling calculation");
|
||||
cancel.cancel();
|
||||
@@ -1701,10 +1909,15 @@ impl Timeline {
|
||||
// one value while current_logical_size is set to the
|
||||
// other.
|
||||
match logical_size.current_size() {
|
||||
Ok(new_current_size) => self
|
||||
Ok(CurrentLogicalSize::Exact(new_current_size)) => self
|
||||
.metrics
|
||||
.current_logical_size_gauge
|
||||
.set(new_current_size.size()),
|
||||
.set(new_current_size),
|
||||
Ok(CurrentLogicalSize::Approximate(_)) => {
|
||||
// don't update the gauge yet, this allows us not to update the gauge back and
|
||||
// forth between the initial size calculation task.
|
||||
}
|
||||
// this is overflow
|
||||
Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
|
||||
}
|
||||
}
|
||||
@@ -2273,7 +2486,7 @@ impl Timeline {
|
||||
// Only one thread may call this function at a time (for this
|
||||
// timeline). If two threads tried to flush the same frozen
|
||||
// layer to disk at the same time, that would not work.
|
||||
assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer));
|
||||
assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer));
|
||||
|
||||
// release lock on 'layers'
|
||||
}
|
||||
@@ -2409,10 +2622,13 @@ impl Timeline {
|
||||
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||
{
|
||||
let partitioning_guard = self.partitioning.lock().unwrap();
|
||||
if partitioning_guard.1 != Lsn(0)
|
||||
&& lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
|
||||
{
|
||||
// no repartitioning needed
|
||||
let distance = lsn.0 - partitioning_guard.1 .0;
|
||||
if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
|
||||
debug!(
|
||||
distance,
|
||||
threshold = self.repartition_threshold,
|
||||
"no repartitioning needed"
|
||||
);
|
||||
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
|
||||
}
|
||||
}
|
||||
@@ -2430,8 +2646,12 @@ impl Timeline {
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
let mut max_deltas = 0;
|
||||
|
||||
for part_range in &partition.ranges {
|
||||
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
||||
for (img_range, last_img) in image_coverage {
|
||||
@@ -2453,21 +2673,25 @@ impl Timeline {
|
||||
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
||||
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||
if img_lsn < lsn {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
let num_deltas =
|
||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
|
||||
|
||||
debug!(
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
max_deltas = max_deltas.max(num_deltas);
|
||||
if num_deltas >= threshold {
|
||||
debug!(
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!(
|
||||
max_deltas,
|
||||
"none of the partitioned ranges had >= {threshold} deltas"
|
||||
);
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
@@ -2580,25 +2804,55 @@ impl Timeline {
|
||||
Ok(layer_paths_to_upload)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct CompactLevel0Phase1Result {
|
||||
new_layers: Vec<DeltaLayer>,
|
||||
deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
|
||||
}
|
||||
|
||||
/// Top-level failure to compact.
|
||||
#[derive(Debug)]
|
||||
enum CompactionError {
|
||||
/// L0 compaction requires layers to be downloaded.
|
||||
///
|
||||
/// This should not happen repeatedly, but will be retried once by top-level
|
||||
/// `Timeline::compact`.
|
||||
DownloadRequired(Vec<Arc<RemoteLayer>>),
|
||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for CompactionError {
|
||||
fn from(value: anyhow::Error) -> Self {
|
||||
CompactionError::Other(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
|
||||
///
|
||||
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||
async fn compact_level0_phase1(
|
||||
&self,
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<CompactLevel0Phase1Result> {
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
let layers = self.layers.read().unwrap();
|
||||
let mut level0_deltas = layers.get_level0_deltas()?;
|
||||
drop(layers);
|
||||
|
||||
// Only compact if enough layers have accumulated.
|
||||
if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
|
||||
return Ok(Default::default());
|
||||
let threshold = self.get_compaction_threshold();
|
||||
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
|
||||
debug!(
|
||||
level0_deltas = level0_deltas.len(),
|
||||
threshold, "too few deltas to compact"
|
||||
);
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
}
|
||||
|
||||
// Gather the files to compact in this iteration.
|
||||
@@ -2634,6 +2888,24 @@ impl Timeline {
|
||||
end: deltas_to_compact.last().unwrap().get_lsn_range().end,
|
||||
};
|
||||
|
||||
let remotes = deltas_to_compact
|
||||
.iter()
|
||||
.filter(|l| l.is_remote_layer())
|
||||
.inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
|
||||
.map(|l| {
|
||||
l.clone()
|
||||
.downcast_remote_layer()
|
||||
.expect("just checked it is remote layer")
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if !remotes.is_empty() {
|
||||
// caller is holding the lock to layer_removal_cs, and we don't want to download while
|
||||
// holding that; in future download_remote_layer might take it as well. this is
|
||||
// regardless of earlier image creation downloading on-demand, while holding the lock.
|
||||
return Err(CompactionError::DownloadRequired(remotes));
|
||||
}
|
||||
|
||||
info!(
|
||||
"Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
|
||||
lsn_range.start,
|
||||
@@ -2641,9 +2913,11 @@ impl Timeline {
|
||||
deltas_to_compact.len(),
|
||||
level0_deltas.len()
|
||||
);
|
||||
|
||||
for l in deltas_to_compact.iter() {
|
||||
info!("compact includes {}", l.filename().file_name());
|
||||
}
|
||||
|
||||
// We don't need the original list of layers anymore. Drop it so that
|
||||
// we don't accidentally use it later in the function.
|
||||
drop(level0_deltas);
|
||||
@@ -2687,6 +2961,47 @@ impl Timeline {
|
||||
},
|
||||
)?;
|
||||
|
||||
// Determine N largest holes where N is number of compacted layers.
|
||||
let max_holes = deltas_to_compact.len();
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let layers = self.layers.read().unwrap(); // Is'n it better to hold original layers lock till here?
|
||||
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
|
||||
let min_hole_coverage_size = 3; // TODO: something more flexible?
|
||||
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
for (next_key, _next_lsn, _size) in itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
|
||||
)? {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
|
||||
let key_range = prev_key..next_key;
|
||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
||||
// That is why it is better to measure size of hole as number of covering image layers.
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
|
||||
if coverage_size >= min_hole_coverage_size {
|
||||
heap.push(Hole {
|
||||
key_range,
|
||||
coverage_size,
|
||||
});
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prev = Some(next_key.next());
|
||||
}
|
||||
drop(layers);
|
||||
let mut holes = heap.into_vec();
|
||||
holes.sort_unstable_by_key(|hole| hole.key_range.start);
|
||||
let mut next_hole = 0; // index of next hole in holes vector
|
||||
|
||||
// Merge the contents of all the input delta layers into a new set
|
||||
// of delta layers, based on the current partitioning.
|
||||
//
|
||||
@@ -2781,14 +3096,22 @@ impl Timeline {
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
// check if key cause layer overflow...
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if key cause layer overflow or contains hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
@@ -2813,7 +3136,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
|
||||
return Err(
|
||||
anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(),
|
||||
);
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
@@ -2832,7 +3157,7 @@ impl Timeline {
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
// minimize latency.
|
||||
par_fsync::par_fsync(&layer_paths)?;
|
||||
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
||||
|
||||
layer_paths.pop().unwrap();
|
||||
}
|
||||
@@ -2854,11 +3179,13 @@ impl Timeline {
|
||||
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), CompactionError> {
|
||||
let CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact,
|
||||
} = self.compact_level0_phase1(target_file_size, ctx).await?;
|
||||
} = self
|
||||
.compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
|
||||
.await?;
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
// nothing to do
|
||||
@@ -2882,7 +3209,12 @@ impl Timeline {
|
||||
for l in new_layers {
|
||||
let new_delta_path = l.path();
|
||||
|
||||
let metadata = new_delta_path.metadata()?;
|
||||
let metadata = new_delta_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"read file metadata for new created layer {}",
|
||||
new_delta_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_upload(
|
||||
@@ -3116,7 +3448,7 @@ impl Timeline {
|
||||
|
||||
let mut layers_to_remove = Vec::new();
|
||||
|
||||
// Scan all on-disk layers in the timeline.
|
||||
// Scan all layers in the timeline (remote or on-disk).
|
||||
//
|
||||
// Garbage collect the layer if all conditions are satisfied:
|
||||
// 1. it is older than cutoff LSN;
|
||||
@@ -3355,14 +3687,26 @@ impl Timeline {
|
||||
&self,
|
||||
remote_layer: Arc<RemoteLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::sync::atomic::Ordering::Relaxed;
|
||||
|
||||
let permit = match Arc::clone(&remote_layer.ongoing_download)
|
||||
.acquire_owned()
|
||||
.await
|
||||
{
|
||||
Ok(permit) => permit,
|
||||
Err(_closed) => {
|
||||
info!("download of layer has already finished");
|
||||
return Ok(());
|
||||
if remote_layer.download_replacement_failure.load(Relaxed) {
|
||||
// this path will be hit often, in case there are upper retries. however
|
||||
// hitting this error will prevent a busy loop between get_reconstruct_data and
|
||||
// download, so an error is prefered.
|
||||
//
|
||||
// TODO: we really should poison the timeline, but panicking is not yet
|
||||
// supported. Related: https://github.com/neondatabase/neon/issues/3621
|
||||
anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
|
||||
} else {
|
||||
info!("download of layer has already finished");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3398,8 +3742,8 @@ impl Timeline {
|
||||
{
|
||||
use crate::tenant::layer_map::Replacement;
|
||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||
match updates.replace_historic(&l, new_layer) {
|
||||
Ok(Replacement::Replaced { .. }) => { /* expected */ }
|
||||
let failure = match updates.replace_historic(&l, new_layer) {
|
||||
Ok(Replacement::Replaced { .. }) => false,
|
||||
Ok(Replacement::NotFound) => {
|
||||
// TODO: the downloaded file should probably be removed, otherwise
|
||||
// it will be added to the layermap on next load? we should
|
||||
@@ -3407,6 +3751,7 @@ impl Timeline {
|
||||
//
|
||||
// See: https://github.com/neondatabase/neon/issues/3533
|
||||
error!("replacing downloaded layer into layermap failed because layer was not found");
|
||||
true
|
||||
}
|
||||
Ok(Replacement::RemovalBuffered) => {
|
||||
unreachable!("current implementation does not remove anything")
|
||||
@@ -3422,14 +3767,35 @@ impl Timeline {
|
||||
error!(
|
||||
expected.ptr = ?Arc::as_ptr(&l),
|
||||
other.ptr = ?Arc::as_ptr(&other),
|
||||
?other,
|
||||
"replacing downloaded layer into layermap failed because another layer was found instead of expected"
|
||||
);
|
||||
true
|
||||
}
|
||||
Err(e) => {
|
||||
// this is a precondition failure, the layer filename derived
|
||||
// attributes didn't match up, which doesn't seem likely.
|
||||
error!("replacing downloaded layer into layermap failed: {e:#?}")
|
||||
error!("replacing downloaded layer into layermap failed: {e:#?}");
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
if failure {
|
||||
// mark the remote layer permanently failed; the timeline is most
|
||||
// likely unusable after this. sadly we cannot just poison the layermap
|
||||
// lock with panic, because that would create an issue with shutdown.
|
||||
//
|
||||
// this does not change the retry semantics on failed downloads.
|
||||
//
|
||||
// use of Relaxed is valid because closing of the semaphore gives
|
||||
// happens-before and wakes up any waiters; we write this value before
|
||||
// and any waiters (or would be waiters) will load it after closing
|
||||
// semaphore.
|
||||
//
|
||||
// See: https://github.com/neondatabase/neon/issues/3533
|
||||
remote_layer
|
||||
.download_replacement_failure
|
||||
.store(true, Relaxed);
|
||||
}
|
||||
}
|
||||
updates.flush();
|
||||
@@ -3442,6 +3808,7 @@ impl Timeline {
|
||||
remote_layer.ongoing_download.close();
|
||||
} else {
|
||||
// Keep semaphore open. We'll drop the permit at the end of the function.
|
||||
info!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
|
||||
}
|
||||
|
||||
// Don't treat it as an error if the task that triggered the download
|
||||
@@ -3455,7 +3822,7 @@ impl Timeline {
|
||||
drop(permit);
|
||||
|
||||
Ok(())
|
||||
},
|
||||
}.in_current_span(),
|
||||
);
|
||||
|
||||
receiver.await.context("download task cancelled")?
|
||||
|
||||
219
pageserver/src/tenant/timeline/eviction_task.rs
Normal file
219
pageserver/src/tenant/timeline/eviction_task.rs
Normal file
@@ -0,0 +1,219 @@
|
||||
//! The per-timeline layer eviction task.
|
||||
|
||||
use std::{
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use either::Either;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
|
||||
use crate::{
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
},
|
||||
};
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
|
||||
let self_clone = Arc::clone(self);
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Eviction,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
&format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
|
||||
false,
|
||||
async move {
|
||||
self_clone.eviction_task(task_mgr::shutdown_token()).await;
|
||||
info!("eviction task finishing");
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
let policy = self.get_eviction_policy();
|
||||
let period = match policy {
|
||||
EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
|
||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
||||
};
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
info!("shutting down");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self.eviction_iteration(&policy, cancel.clone()).await;
|
||||
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
ControlFlow::Continue(sleep_until) => {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("shutting down");
|
||||
break;
|
||||
}
|
||||
_ = tokio::time::sleep_until(sleep_until) => { }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
|
||||
async fn eviction_iteration(
|
||||
self: &Arc<Self>,
|
||||
policy: &EvictionPolicy,
|
||||
cancel: CancellationToken,
|
||||
) -> ControlFlow<(), Instant> {
|
||||
debug!("eviction iteration: {policy:?}");
|
||||
match policy {
|
||||
EvictionPolicy::NoEviction => {
|
||||
// check again in 10 seconds; XXX config watch mechanism
|
||||
ControlFlow::Continue(Instant::now() + Duration::from_secs(10))
|
||||
}
|
||||
EvictionPolicy::LayerAccessThreshold(p) => {
|
||||
let start = Instant::now();
|
||||
match self.eviction_iteration_threshold(p, cancel).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
let elapsed = start.elapsed();
|
||||
crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
|
||||
ControlFlow::Continue(start + p.period)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn eviction_iteration_threshold(
|
||||
self: &Arc<Self>,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: CancellationToken,
|
||||
) -> ControlFlow<()> {
|
||||
let now = SystemTime::now();
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
struct EvictionStats {
|
||||
candidates: usize,
|
||||
evicted: usize,
|
||||
errors: usize,
|
||||
not_evictable: usize,
|
||||
skipped_for_shutdown: usize,
|
||||
}
|
||||
let mut stats = EvictionStats::default();
|
||||
// Gather layers for eviction.
|
||||
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
||||
// We don't want to hold the layer map lock during eviction.
|
||||
// So, we just need to deal with this.
|
||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||
let layers = self.layers.read().unwrap();
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
if hist_layer.is_remote_layer() {
|
||||
continue;
|
||||
}
|
||||
let last_activity_ts = match hist_layer
|
||||
.access_stats()
|
||||
.most_recent_access_or_residence_event()
|
||||
{
|
||||
Either::Left(mra) => mra.when,
|
||||
Either::Right(re) => re.timestamp,
|
||||
};
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
Err(_e) => {
|
||||
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
||||
// happen if there is an access between us getting `now`, and us getting
|
||||
// the access stats from the layer.
|
||||
//
|
||||
// The other reason why it can happen is system clock skew because
|
||||
// SystemTime::now() is not monotonic, so, even if there is no access
|
||||
// to the layer after we get `now` at the beginning of this function,
|
||||
// it could be that `now` < `last_activity_ts`.
|
||||
//
|
||||
// To distinguish the cases, we would need to record `Instant`s in the
|
||||
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
||||
// values in the access stats would need to be `Instant`'s, and hence
|
||||
// they would be meaningless outside of the pageserver process.
|
||||
// At the time of writing, the trade-off is that access stats are more
|
||||
// valuable than detecting clock skew.
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if no_activity_for > p.threshold {
|
||||
candidates.push(hist_layer)
|
||||
}
|
||||
}
|
||||
candidates
|
||||
};
|
||||
stats.candidates = candidates.len();
|
||||
|
||||
let remote_client = match self.remote_client.as_ref() {
|
||||
None => {
|
||||
error!(
|
||||
num_candidates = candidates.len(),
|
||||
"no remote storage configured, cannot evict layers"
|
||||
);
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Some(c) => c,
|
||||
};
|
||||
|
||||
let results = match self
|
||||
.evict_layer_batch(remote_client, &candidates[..], cancel)
|
||||
.await
|
||||
{
|
||||
Err(pre_err) => {
|
||||
stats.errors += candidates.len();
|
||||
error!("could not do any evictions: {pre_err:#}");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Ok(results) => results,
|
||||
};
|
||||
assert_eq!(results.len(), candidates.len());
|
||||
for (l, result) in candidates.iter().zip(results) {
|
||||
match result {
|
||||
None => {
|
||||
stats.skipped_for_shutdown += 1;
|
||||
}
|
||||
Some(Ok(true)) => {
|
||||
debug!("evicted layer {l:?}");
|
||||
stats.evicted += 1;
|
||||
}
|
||||
Some(Ok(false)) => {
|
||||
debug!("layer is not evictable: {l:?}");
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
// This variant is the case where an unexpected error happened during eviction.
|
||||
// Expected errors that result in non-eviction are `Some(Ok(false))`.
|
||||
// So, dump Debug here to gather as much info as possible in this rare case.
|
||||
warn!("failed to evict layer {l:?}: {e:?}");
|
||||
stats.errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if stats.candidates == stats.not_evictable {
|
||||
debug!(stats=?stats, "eviction iteration complete");
|
||||
} else if stats.errors > 0 || stats.not_evictable > 0 {
|
||||
warn!(stats=?stats, "eviction iteration complete");
|
||||
} else {
|
||||
info!(stats=?stats, "eviction iteration complete");
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,7 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, ti
|
||||
|
||||
use super::TaskStateUpdate;
|
||||
use crate::broker_client::get_broker_client;
|
||||
use crate::context::RequestContext;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::Timeline;
|
||||
@@ -413,7 +413,7 @@ impl WalreceiverState {
|
||||
let timeline = Arc::clone(&self.timeline);
|
||||
let ctx = ctx.detached_child(
|
||||
TaskKind::WalReceiverConnectionHandler,
|
||||
ctx.download_behavior(),
|
||||
DownloadBehavior::Download,
|
||||
);
|
||||
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
|
||||
async move {
|
||||
|
||||
@@ -27,8 +27,7 @@ use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::fd::RawFd;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::os::unix::prelude::CommandExt;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
|
||||
111
poetry.lock
generated
111
poetry.lock
generated
@@ -865,50 +865,47 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "38.0.3"
|
||||
version = "39.0.1"
|
||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-win32.whl", hash = "sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0"},
|
||||
{file = "cryptography-38.0.3-cp36-abi3-win_amd64.whl", hash = "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220"},
|
||||
{file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd"},
|
||||
{file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55"},
|
||||
{file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b"},
|
||||
{file = "cryptography-38.0.3-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36"},
|
||||
{file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d"},
|
||||
{file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7"},
|
||||
{file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249"},
|
||||
{file = "cryptography-38.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50"},
|
||||
{file = "cryptography-38.0.3-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0"},
|
||||
{file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8"},
|
||||
{file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436"},
|
||||
{file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548"},
|
||||
{file = "cryptography-38.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a"},
|
||||
{file = "cryptography-38.0.3.tar.gz", hash = "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
|
||||
{file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
|
||||
{file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
|
||||
{file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cffi = ">=1.12"
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"]
|
||||
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
|
||||
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
|
||||
pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"]
|
||||
pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
|
||||
sdist = ["setuptools-rust (>=0.11.4)"]
|
||||
ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"]
|
||||
test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
|
||||
test-randomorder = ["pytest-randomly"]
|
||||
tox = ["tox"]
|
||||
|
||||
[[package]]
|
||||
name = "docker"
|
||||
@@ -1311,66 +1308,63 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "moto"
|
||||
version = "3.1.18"
|
||||
description = "A library that allows your python tests to easily mock out the boto library"
|
||||
version = "4.1.2"
|
||||
description = ""
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"},
|
||||
{file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"},
|
||||
{file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"},
|
||||
{file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""}
|
||||
boto3 = ">=1.9.201"
|
||||
botocore = ">=1.12.201"
|
||||
cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""}
|
||||
cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""}
|
||||
cryptography = ">=3.3.1"
|
||||
docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""}
|
||||
ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""}
|
||||
flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""}
|
||||
flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""}
|
||||
flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
|
||||
graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
|
||||
idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""}
|
||||
Jinja2 = ">=2.10.1"
|
||||
jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
|
||||
MarkupSafe = "!=2.0.0a1"
|
||||
openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""}
|
||||
pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""}
|
||||
python-dateutil = ">=2.1,<3.0.0"
|
||||
python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
|
||||
pytz = "*"
|
||||
PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
|
||||
requests = ">=2.5"
|
||||
responses = ">=0.9.0"
|
||||
responses = ">=0.13.0"
|
||||
setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
|
||||
sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
|
||||
werkzeug = ">=0.5,<2.2.0"
|
||||
werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1"
|
||||
xmltodict = "*"
|
||||
|
||||
[package.extras]
|
||||
all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
|
||||
apigatewayv2 = ["PyYAML (>=5.1)"]
|
||||
appsync = ["graphql-core"]
|
||||
awslambda = ["docker (>=2.5.1)"]
|
||||
batch = ["docker (>=2.5.1)"]
|
||||
cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
|
||||
ds = ["sshpubkeys (>=3.1.0)"]
|
||||
dynamodb = ["docker (>=2.5.1)"]
|
||||
dynamodb2 = ["docker (>=2.5.1)"]
|
||||
dynamodbstreams = ["docker (>=2.5.1)"]
|
||||
ebs = ["sshpubkeys (>=3.1.0)"]
|
||||
ec2 = ["sshpubkeys (>=3.1.0)"]
|
||||
efs = ["sshpubkeys (>=3.1.0)"]
|
||||
eks = ["sshpubkeys (>=3.1.0)"]
|
||||
glue = ["pyparsing (>=3.0.7)"]
|
||||
iotdata = ["jsondiff (>=1.1.2)"]
|
||||
route53resolver = ["sshpubkeys (>=3.1.0)"]
|
||||
s3 = ["PyYAML (>=5.1)"]
|
||||
server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
ssm = ["PyYAML (>=5.1)", "dataclasses"]
|
||||
server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
ssm = ["PyYAML (>=5.1)"]
|
||||
xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
|
||||
|
||||
[[package]]
|
||||
@@ -2092,18 +2086,6 @@ cryptography = ["cryptography (>=3.4.0)"]
|
||||
pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"]
|
||||
pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2022.1"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
|
||||
{file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pywin32"
|
||||
version = "301"
|
||||
@@ -2452,16 +2434,19 @@ test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "2.1.2"
|
||||
version = "2.2.3"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"},
|
||||
{file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"},
|
||||
{file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
|
||||
{file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
MarkupSafe = ">=2.1.1"
|
||||
|
||||
[package.extras]
|
||||
watchdog = ["watchdog"]
|
||||
|
||||
@@ -2658,4 +2643,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3"
|
||||
content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd"
|
||||
|
||||
@@ -28,6 +28,7 @@ itertools.workspace = true
|
||||
md5.workspace = true
|
||||
metrics.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
pq_proto.workspace = true
|
||||
@@ -35,6 +36,8 @@ prometheus.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
reqwest-middleware.workspace = true
|
||||
reqwest-tracing.workspace = true
|
||||
routerify.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
rustls.workspace = true
|
||||
@@ -43,12 +46,15 @@ serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sha2.workspace = true
|
||||
socket2.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
thiserror.workspace = true
|
||||
tls-listener.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio = { workspace = true, features = ["signal"] }
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
mod classic;
|
||||
|
||||
mod hacks;
|
||||
mod link;
|
||||
use futures::TryFutureExt;
|
||||
|
||||
pub use link::LinkAuthError;
|
||||
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
auth::{self, ClientCredentials},
|
||||
console::{
|
||||
self,
|
||||
provider::{CachedNodeInfo, ConsoleReqExtra},
|
||||
@@ -13,9 +13,10 @@ use crate::{
|
||||
},
|
||||
stream, url,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use std::borrow::Cow;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
use tracing::info;
|
||||
|
||||
/// A product of successful authentication.
|
||||
pub struct AuthSuccess<T> {
|
||||
@@ -105,97 +106,49 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc).
|
||||
// Read more: https://github.com/rust-lang/rust/issues/99190
|
||||
// Alleged fix: https://github.com/rust-lang/rust/pull/89056
|
||||
impl<'l> BackendType<'l, ClientCredentials<'_>> {
|
||||
/// Do something special if user didn't provide the `project` parameter.
|
||||
async fn try_password_hack<'a>(
|
||||
&'a mut self,
|
||||
extra: &'a ConsoleReqExtra<'a>,
|
||||
client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<Option<AuthSuccess<CachedNodeInfo>>> {
|
||||
use BackendType::*;
|
||||
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the project name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
let fetch_magic_payload = |client| async {
|
||||
warn!("project name not specified, resorting to the password hack auth flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
info!(project = &payload.project, "received missing parameter");
|
||||
auth::Result::Ok(payload)
|
||||
};
|
||||
|
||||
// If we want to use cleartext password flow, we can read the password
|
||||
// from the client and pretend that it's a magic payload (PasswordHack hack).
|
||||
let fetch_plaintext_password = |client| async {
|
||||
info!("using cleartext password flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
auth::Result::Ok(auth::password_hack::PasswordHackPayload {
|
||||
project: String::new(),
|
||||
password: payload,
|
||||
})
|
||||
};
|
||||
|
||||
// TODO: find a proper way to merge those very similar blocks.
|
||||
let (mut node, password) = match self {
|
||||
Console(api, creds) if creds.project.is_none() => {
|
||||
let payload = fetch_magic_payload(client).await?;
|
||||
creds.project = Some(payload.project.into());
|
||||
let node = api.wake_compute(extra, creds).await?;
|
||||
|
||||
(node, payload.password)
|
||||
}
|
||||
// This is a hack to allow cleartext password in secure connections (wss).
|
||||
Console(api, creds) if creds.use_cleartext_password_flow => {
|
||||
let payload = fetch_plaintext_password(client).await?;
|
||||
let node = api.wake_compute(extra, creds).await?;
|
||||
|
||||
(node, payload.password)
|
||||
}
|
||||
Postgres(api, creds) if creds.project.is_none() => {
|
||||
let payload = fetch_magic_payload(client).await?;
|
||||
creds.project = Some(payload.project.into());
|
||||
let node = api.wake_compute(extra, creds).await?;
|
||||
|
||||
(node, payload.password)
|
||||
}
|
||||
_ => return Ok(None),
|
||||
};
|
||||
|
||||
node.config.password(password);
|
||||
Ok(Some(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
}))
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
async fn auth_quirks(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the endpoint (project) name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
if creds.project.is_none() {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::password_hack(api, extra, creds, client).await;
|
||||
}
|
||||
|
||||
// Password hack should set the project name.
|
||||
// TODO: make `creds.project` more type-safe.
|
||||
assert!(creds.project.is_some());
|
||||
|
||||
// Perform cleartext auth if we're allowed to do that.
|
||||
// Currently, we use it for websocket connections (latency).
|
||||
if allow_cleartext {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::cleartext_hack(api, extra, creds, client).await;
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
classic::authenticate(api, extra, creds, client).await
|
||||
}
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
pub async fn authenticate<'a>(
|
||||
#[tracing::instrument(fields(allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
&mut self,
|
||||
extra: &'a ConsoleReqExtra<'a>,
|
||||
client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
use BackendType::*;
|
||||
|
||||
// Handle cases when `project` is missing in `creds`.
|
||||
// TODO: type safety: return `creds` with irrefutable `project`.
|
||||
if let Some(res) = self.try_password_hack(extra, client).await? {
|
||||
info!("user successfully authenticated (using the password hack)");
|
||||
return Ok(res);
|
||||
}
|
||||
|
||||
let res = match self {
|
||||
Console(api, creds) => {
|
||||
info!(
|
||||
@@ -204,20 +157,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
|
||||
"performing authentication using the console"
|
||||
);
|
||||
|
||||
assert!(creds.project.is_some());
|
||||
classic::handle_user(api.as_ref(), extra, creds, client).await?
|
||||
let api = api.as_ref();
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext).await?
|
||||
}
|
||||
Postgres(api, creds) => {
|
||||
info!("performing mock authentication using a local postgres instance");
|
||||
info!(
|
||||
user = creds.user,
|
||||
project = creds.project(),
|
||||
"performing authentication using a local postgres instance"
|
||||
);
|
||||
|
||||
assert!(creds.project.is_some());
|
||||
classic::handle_user(api.as_ref(), extra, creds, client).await?
|
||||
let api = api.as_ref();
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext).await?
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
info!("performing link authentication");
|
||||
|
||||
link::handle_user(url, client)
|
||||
link::authenticate(url, client)
|
||||
.await?
|
||||
.map(CachedNodeInfo::new_uncached)
|
||||
}
|
||||
@@ -229,9 +186,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
|
||||
|
||||
/// When applicable, wake the compute node, gaining its connection info in the process.
|
||||
/// The link auth flow doesn't support this, so we return [`None`] in that case.
|
||||
pub async fn wake_compute<'a>(
|
||||
pub async fn wake_compute(
|
||||
&self,
|
||||
extra: &'a ConsoleReqExtra<'a>,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ use crate::{
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
pub(super) async fn handle_user(
|
||||
pub(super) async fn authenticate(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
|
||||
66
proxy/src/auth/backend/hacks.rs
Normal file
66
proxy/src/auth/backend/hacks.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use super::AuthSuccess;
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
console::{
|
||||
self,
|
||||
provider::{CachedNodeInfo, ConsoleReqExtra},
|
||||
},
|
||||
stream,
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Compared to [SCRAM](crate::scram), cleartext password auth saves
|
||||
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
|
||||
/// These properties are benefical for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn cleartext_hack(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
let password = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(password);
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
})
|
||||
}
|
||||
|
||||
/// Workaround for clients which don't provide an endpoint (project) name.
|
||||
/// Very similar to [`cleartext_hack`], but there's a specific password format.
|
||||
pub async fn password_hack(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
warn!("project not specified, resorting to the password hack auth flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
info!(project = &payload.project, "received missing parameter");
|
||||
creds.project = Some(payload.project.into());
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(payload.password);
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
})
|
||||
}
|
||||
@@ -53,7 +53,7 @@ pub fn new_psql_session_id() -> String {
|
||||
hex::encode(rand::random::<[u8; 8]>())
|
||||
}
|
||||
|
||||
pub(super) async fn handle_user(
|
||||
pub(super) async fn authenticate(
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<NodeInfo>> {
|
||||
@@ -78,6 +78,8 @@ pub(super) async fn handle_user(
|
||||
|
||||
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
|
||||
|
||||
// This config should be self-contained, because we won't
|
||||
// take username or dbname from client's startup message.
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(&db_info.host)
|
||||
|
||||
@@ -11,12 +11,16 @@ pub enum ClientCredsParseError {
|
||||
#[error("Parameter '{0}' is missing in startup packet.")]
|
||||
MissingKey(&'static str),
|
||||
|
||||
#[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)]
|
||||
#[error(
|
||||
"Inconsistent project name inferred from \
|
||||
SNI ('{}') and project option ('{}').",
|
||||
.domain, .option,
|
||||
)]
|
||||
InconsistentProjectNames { domain: String, option: String },
|
||||
|
||||
#[error(
|
||||
"SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
|
||||
SNI should be formatted as '<project-name>.{}'.",
|
||||
SNI should be formatted as '<project-name>.{}'.",
|
||||
.sni, .cn, .cn,
|
||||
)]
|
||||
InconsistentSni { sni: String, cn: String },
|
||||
@@ -32,12 +36,8 @@ impl UserFacingError for ClientCredsParseError {}
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ClientCredentials<'a> {
|
||||
pub user: &'a str,
|
||||
pub dbname: &'a str,
|
||||
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
||||
pub project: Option<Cow<'a, str>>,
|
||||
/// If `True`, we'll use the old cleartext password flow. This is used for
|
||||
/// websocket connections, which want to minimize the number of round trips.
|
||||
pub use_cleartext_password_flow: bool,
|
||||
}
|
||||
|
||||
impl ClientCredentials<'_> {
|
||||
@@ -52,14 +52,12 @@ impl<'a> ClientCredentials<'a> {
|
||||
params: &'a StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_name: Option<&str>,
|
||||
use_cleartext_password_flow: bool,
|
||||
) -> Result<Self, ClientCredsParseError> {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
// Some parameters are stored in the startup message.
|
||||
let get_param = |key| params.get(key).ok_or(MissingKey(key));
|
||||
let user = get_param("user")?;
|
||||
let dbname = get_param("database")?;
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let project_option = params.options_raw().and_then(|mut options| {
|
||||
@@ -98,20 +96,9 @@ impl<'a> ClientCredentials<'a> {
|
||||
}
|
||||
.transpose()?;
|
||||
|
||||
info!(
|
||||
user = user,
|
||||
dbname = dbname,
|
||||
project = project.as_deref(),
|
||||
use_cleartext_password_flow = use_cleartext_password_flow,
|
||||
"credentials"
|
||||
);
|
||||
info!(user, project = project.as_deref(), "credentials");
|
||||
|
||||
Ok(Self {
|
||||
user,
|
||||
dbname,
|
||||
project,
|
||||
use_cleartext_password_flow,
|
||||
})
|
||||
Ok(Self { user, project })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,25 +118,27 @@ mod tests {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
#[test]
|
||||
#[ignore = "TODO: fix how database is handled"]
|
||||
fn parse_bare_minimum() -> anyhow::Result<()> {
|
||||
// According to postgresql, only `user` should be required.
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
// TODO: check that `creds.dbname` is None.
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project, None);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_missing_project() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
|
||||
fn parse_excessive() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("database", "world"), // should be ignored
|
||||
("foo", "bar"), // should be ignored
|
||||
]);
|
||||
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project, None);
|
||||
|
||||
Ok(())
|
||||
@@ -157,14 +146,13 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_project_from_sni() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let sni = Some("foo.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
|
||||
Ok(())
|
||||
@@ -174,13 +162,11 @@ mod tests {
|
||||
fn parse_project_from_options() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("database", "world"),
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
|
||||
Ok(())
|
||||
@@ -188,18 +174,13 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_projects_identical() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("database", "world"),
|
||||
("options", "project=baz"),
|
||||
]);
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
|
||||
|
||||
let sni = Some("baz.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
|
||||
Ok(())
|
||||
@@ -207,17 +188,13 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_projects_different() {
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("database", "world"),
|
||||
("options", "project=first"),
|
||||
]);
|
||||
let options =
|
||||
StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);
|
||||
|
||||
let sni = Some("second.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let err =
|
||||
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentProjectNames { domain, option } => {
|
||||
assert_eq!(option, "first");
|
||||
@@ -229,13 +206,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_inconsistent_sni() {
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_name = Some("example.com");
|
||||
|
||||
let err =
|
||||
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentSni { sni, cn } => {
|
||||
assert_eq!(sni, "project.localhost");
|
||||
|
||||
@@ -65,14 +65,21 @@ impl ConnCfg {
|
||||
|
||||
/// Apply startup message params to the connection config.
|
||||
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
|
||||
if let Some(options) = params.options_raw() {
|
||||
// We must drop all proxy-specific parameters.
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = options
|
||||
.filter(|opt| !opt.starts_with("project="))
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
|
||||
self.user(user);
|
||||
}
|
||||
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
|
||||
self.dbname(dbname);
|
||||
}
|
||||
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(params) {
|
||||
self.options(&options);
|
||||
}
|
||||
|
||||
@@ -213,3 +220,46 @@ impl ConnCfg {
|
||||
Ok(connection)
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve `options` from a startup message, dropping all proxy-secific flags.
|
||||
fn filtered_options(params: &StartupMessageParams) -> Option<String> {
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = params
|
||||
.options_raw()?
|
||||
.filter(|opt| !opt.starts_with("project="))
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
|
||||
// Don't even bother with empty options.
|
||||
if options.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(options)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_filtered_options() {
|
||||
// Empty options is unlikely to be useful anyway.
|
||||
let params = StartupMessageParams::new([("options", "")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
|
||||
// It's likely that clients will only use options to specify endpoint/project.
|
||||
let params = StartupMessageParams::new([("options", "project=foo")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
|
||||
// Same, because unescaped whitespaces are no-op.
|
||||
let params = StartupMessageParams::new([("options", " project=foo ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), None);
|
||||
|
||||
let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ "));
|
||||
|
||||
let params = StartupMessageParams::new([("options", "project = foo")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ pub struct ProxyConfig {
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MetricCollectionConfig {
|
||||
pub endpoint: reqwest::Url,
|
||||
pub interval: Duration,
|
||||
|
||||
@@ -5,10 +5,7 @@ use crate::{
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
|
||||
use std::{
|
||||
net::{TcpListener, TcpStream},
|
||||
thread,
|
||||
};
|
||||
use std::{net::TcpStream, thread};
|
||||
use tracing::{error, info, info_span};
|
||||
use utils::{
|
||||
postgres_backend::{self, AuthType, PostgresBackend},
|
||||
@@ -34,23 +31,24 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N
|
||||
CPLANE_WAITERS.notify(psql_session_id, msg)
|
||||
}
|
||||
|
||||
/// Console management API listener thread.
|
||||
/// Console management API listener task.
|
||||
/// It spawns console response handlers needed for the link auth.
|
||||
pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
|
||||
pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("mgmt has shut down");
|
||||
}
|
||||
|
||||
listener
|
||||
.set_nonblocking(false)
|
||||
.context("failed to set listener to blocking")?;
|
||||
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
|
||||
let (socket, peer_addr) = listener.accept().await?;
|
||||
info!("accepted connection from {peer_addr}");
|
||||
|
||||
let socket = socket.into_std()?;
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set client socket option")?;
|
||||
socket
|
||||
.set_nonblocking(false)
|
||||
.context("failed to set client socket option")?;
|
||||
|
||||
// TODO: replace with async tasks.
|
||||
thread::spawn(move || {
|
||||
|
||||
@@ -11,8 +11,10 @@ use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub mod errors {
|
||||
use crate::error::{io_error, UserFacingError};
|
||||
use reqwest::StatusCode as HttpStatusCode;
|
||||
use crate::{
|
||||
error::{io_error, UserFacingError},
|
||||
http,
|
||||
};
|
||||
use thiserror::Error;
|
||||
|
||||
/// A go-to error message which doesn't leak any detail.
|
||||
@@ -24,7 +26,7 @@ pub mod errors {
|
||||
/// Error returned by the console itself.
|
||||
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
|
||||
Console {
|
||||
status: HttpStatusCode,
|
||||
status: http::StatusCode,
|
||||
text: Box<str>,
|
||||
},
|
||||
|
||||
@@ -35,7 +37,7 @@ pub mod errors {
|
||||
|
||||
impl ApiError {
|
||||
/// Returns HTTP status code if it's the reason for failure.
|
||||
pub fn http_status_code(&self) -> Option<HttpStatusCode> {
|
||||
pub fn http_status_code(&self) -> Option<http::StatusCode> {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
Console { status, .. } => Some(*status),
|
||||
@@ -51,15 +53,15 @@ pub mod errors {
|
||||
// To minimize risks, only select errors are forwarded to users.
|
||||
// Ask @neondatabase/control-plane for review before adding more.
|
||||
Console { status, .. } => match *status {
|
||||
HttpStatusCode::NOT_FOUND => {
|
||||
http::StatusCode::NOT_FOUND => {
|
||||
// Status 404: failed to get a project-related resource.
|
||||
format!("{REQUEST_FAILED}: endpoint cannot be found")
|
||||
}
|
||||
HttpStatusCode::NOT_ACCEPTABLE => {
|
||||
http::StatusCode::NOT_ACCEPTABLE => {
|
||||
// Status 406: endpoint is disabled (we don't allow connections).
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
HttpStatusCode::LOCKED => {
|
||||
http::StatusCode::LOCKED => {
|
||||
// Status 423: project might be in maintenance mode (or bad state).
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable")
|
||||
}
|
||||
@@ -70,13 +72,18 @@ pub mod errors {
|
||||
}
|
||||
}
|
||||
|
||||
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
|
||||
impl From<reqwest::Error> for ApiError {
|
||||
fn from(e: reqwest::Error) -> Self {
|
||||
io_error(e).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<reqwest_middleware::Error> for ApiError {
|
||||
fn from(e: reqwest_middleware::Error) -> Self {
|
||||
io_error(e).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum GetAuthInfoError {
|
||||
// We shouldn't include the actual secret here.
|
||||
|
||||
@@ -82,16 +82,11 @@ impl Api {
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_wake_compute(
|
||||
&self,
|
||||
creds: &ClientCredentials<'_>,
|
||||
) -> Result<NodeInfo, WakeComputeError> {
|
||||
async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(self.endpoint.host_str().unwrap_or("localhost"))
|
||||
.port(self.endpoint.port().unwrap_or(5432))
|
||||
.dbname(creds.dbname)
|
||||
.user(creds.user);
|
||||
.port(self.endpoint.port().unwrap_or(5432));
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
@@ -117,9 +112,9 @@ impl super::Api for Api {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
_creds: &ClientCredentials<'_>,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
self.do_wake_compute(creds)
|
||||
self.do_wake_compute()
|
||||
.map_ok(CachedNodeInfo::new_uncached)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ use super::{
|
||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use reqwest::StatusCode as HttpStatusCode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -52,7 +51,7 @@ impl Api {
|
||||
Ok(body) => body,
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
Err(e) => match e.http_status_code() {
|
||||
Some(HttpStatusCode::NOT_FOUND) => return Ok(None),
|
||||
Some(http::StatusCode::NOT_FOUND) => return Ok(None),
|
||||
_otherwise => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
@@ -97,12 +96,11 @@ impl Api {
|
||||
Some(x) => x,
|
||||
};
|
||||
|
||||
// Don't set anything but host and port! This config will be cached.
|
||||
// We'll set username and such later using the startup message.
|
||||
// TODO: add more type safety (in progress).
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(host)
|
||||
.port(port)
|
||||
.dbname(creds.dbname)
|
||||
.user(creds.user);
|
||||
config.host(host).port(port);
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
@@ -155,7 +153,7 @@ impl super::Api for Api {
|
||||
|
||||
/// Parse http response body, taking status code into account.
|
||||
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
response: reqwest::Response,
|
||||
response: http::Response,
|
||||
) -> Result<T, ApiError> {
|
||||
let status = response.status();
|
||||
if status.is_success() {
|
||||
|
||||
@@ -1,7 +1,24 @@
|
||||
//! HTTP client and server impls.
|
||||
//! Other modules should use stuff from this module instead of
|
||||
//! directly relying on deps like `reqwest` (think loose coupling).
|
||||
|
||||
pub mod server;
|
||||
pub mod websocket;
|
||||
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
|
||||
use crate::url::ApiUrl;
|
||||
use reqwest_middleware::RequestBuilder;
|
||||
|
||||
/// This is the preferred way to create new http clients,
|
||||
/// because it takes care of observability (OpenTelemetry).
|
||||
/// We deliberately don't want to replace this with a public static.
|
||||
pub fn new_client() -> ClientWithMiddleware {
|
||||
reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
|
||||
.with(reqwest_tracing::TracingMiddleware::default())
|
||||
.build()
|
||||
}
|
||||
|
||||
/// Thin convenience wrapper for an API provided by an http endpoint.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -9,13 +26,17 @@ pub struct Endpoint {
|
||||
/// API's base URL.
|
||||
endpoint: ApiUrl,
|
||||
/// Connection manager with built-in pooling.
|
||||
client: reqwest::Client,
|
||||
client: ClientWithMiddleware,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
/// Construct a new HTTP endpoint wrapper.
|
||||
pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self {
|
||||
Self { endpoint, client }
|
||||
/// Http client is not constructed under the hood so that it can be shared.
|
||||
pub fn new(endpoint: ApiUrl, client: impl Into<ClientWithMiddleware>) -> Self {
|
||||
Self {
|
||||
endpoint,
|
||||
client: client.into(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -23,19 +44,16 @@ impl Endpoint {
|
||||
&self.endpoint
|
||||
}
|
||||
|
||||
/// Return a [builder](reqwest::RequestBuilder) for a `GET` request,
|
||||
/// Return a [builder](RequestBuilder) for a `GET` request,
|
||||
/// appending a single `path` segment to the base endpoint URL.
|
||||
pub fn get(&self, path: &str) -> reqwest::RequestBuilder {
|
||||
pub fn get(&self, path: &str) -> RequestBuilder {
|
||||
let mut url = self.endpoint.clone();
|
||||
url.path_segments_mut().push(path);
|
||||
self.client.get(url.into_inner())
|
||||
}
|
||||
|
||||
/// Execute a [request](reqwest::Request).
|
||||
pub async fn execute(
|
||||
&self,
|
||||
request: reqwest::Request,
|
||||
) -> Result<reqwest::Response, reqwest::Error> {
|
||||
pub async fn execute(&self, request: Request) -> Result<Response, Error> {
|
||||
self.client.execute(request).await
|
||||
}
|
||||
}
|
||||
@@ -43,11 +61,12 @@ impl Endpoint {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use reqwest::Client;
|
||||
|
||||
#[test]
|
||||
fn optional_query_params() -> anyhow::Result<()> {
|
||||
let url = "http://example.com".parse()?;
|
||||
let endpoint = Endpoint::new(url, reqwest::Client::new());
|
||||
let endpoint = Endpoint::new(url, Client::new());
|
||||
|
||||
// Validate that this pattern makes sense.
|
||||
let req = endpoint
|
||||
@@ -66,7 +85,7 @@ mod tests {
|
||||
#[test]
|
||||
fn uuid_params() -> anyhow::Result<()> {
|
||||
let url = "http://example.com".parse()?;
|
||||
let endpoint = Endpoint::new(url, reqwest::Client::new());
|
||||
let endpoint = Endpoint::new(url, Client::new());
|
||||
|
||||
let req = endpoint
|
||||
.get("frobnicate")
|
||||
|
||||
@@ -1,161 +1,136 @@
|
||||
use crate::{
|
||||
cancellation::CancelMap, config::ProxyConfig, error::io_error, proxy::handle_ws_client,
|
||||
};
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream, StreamExt};
|
||||
use hyper::server::accept;
|
||||
use hyper::server::conn::AddrIncoming;
|
||||
use hyper::upgrade::Upgraded;
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use hyper_tungstenite::{tungstenite, WebSocketStream};
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket};
|
||||
use hyper::{
|
||||
server::{accept, conn::AddrIncoming},
|
||||
upgrade::Upgraded,
|
||||
Body, Request, Response, StatusCode,
|
||||
};
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use pin_project_lite::pin_project;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use std::convert::Infallible;
|
||||
use std::future::ready;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{ready, Context, Poll};
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
future::ready,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{ready, Context, Poll},
|
||||
};
|
||||
use tls_listener::TlsListener;
|
||||
|
||||
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
|
||||
|
||||
use tokio::{
|
||||
io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
|
||||
net::TcpListener,
|
||||
};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::http::{error::ApiError, json::json_response};
|
||||
|
||||
use crate::cancellation::CancelMap;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::proxy::handle_ws_client;
|
||||
// TODO: use `std::sync::Exclusive` once it's stabilized.
|
||||
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
|
||||
use sync_wrapper::SyncWrapper;
|
||||
|
||||
pin_project! {
|
||||
/// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite.
|
||||
pub struct WebSocketRW {
|
||||
/// This is a wrapper around a [`WebSocketStream`] that
|
||||
/// implements [`AsyncRead`] and [`AsyncWrite`].
|
||||
pub struct WebSocketRw {
|
||||
#[pin]
|
||||
stream: WebSocketStream<Upgraded>,
|
||||
chunk: Option<bytes::Bytes>,
|
||||
stream: SyncWrapper<WebSocketStream<Upgraded>>,
|
||||
bytes: Bytes,
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: explain why this is safe or try to remove `unsafe impl`.
|
||||
unsafe impl Sync for WebSocketRW {}
|
||||
|
||||
impl WebSocketRW {
|
||||
impl WebSocketRw {
|
||||
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
chunk: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn has_chunk(&self) -> bool {
|
||||
if let Some(ref chunk) = self.chunk {
|
||||
chunk.remaining() > 0
|
||||
} else {
|
||||
false
|
||||
stream: stream.into(),
|
||||
bytes: Bytes::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn ws_err_into(e: tungstenite::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e.to_string())
|
||||
}
|
||||
|
||||
impl AsyncWrite for WebSocketRW {
|
||||
impl AsyncWrite for WebSocketRw {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, io::Error>> {
|
||||
let mut this = self.project();
|
||||
match this.stream.as_mut().poll_ready(cx) {
|
||||
Poll::Ready(Ok(())) => {
|
||||
if let Err(e) = this
|
||||
.stream
|
||||
.as_mut()
|
||||
.start_send(Message::Binary(buf.to_vec()))
|
||||
{
|
||||
Poll::Ready(Err(ws_err_into(e)))
|
||||
} else {
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
}
|
||||
Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))),
|
||||
Poll::Pending => {
|
||||
cx.waker().wake_by_ref();
|
||||
Poll::Pending
|
||||
}
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let mut stream = self.project().stream.get_pin_mut();
|
||||
|
||||
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
|
||||
match stream.as_mut().start_send(Message::Binary(buf.into())) {
|
||||
Ok(()) => Poll::Ready(Ok(buf.len())),
|
||||
Err(e) => Poll::Ready(Err(io_error(e))),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
|
||||
self.project().stream.poll_flush(cx).map_err(ws_err_into)
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_flush(cx).map_err(io_error)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
|
||||
self.project().stream.poll_close(cx).map_err(ws_err_into)
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_close(cx).map_err(io_error)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncRead for WebSocketRW {
|
||||
impl AsyncRead for WebSocketRw {
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
if buf.remaining() == 0 {
|
||||
return Poll::Ready(Ok(()));
|
||||
if buf.remaining() > 0 {
|
||||
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
|
||||
let len = std::cmp::min(bytes.len(), buf.remaining());
|
||||
buf.put_slice(&bytes[..len]);
|
||||
self.consume(len);
|
||||
}
|
||||
|
||||
let inner_buf = match ready!(self.as_mut().poll_fill_buf(cx)) {
|
||||
Ok(buf) => buf,
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
};
|
||||
let len = std::cmp::min(inner_buf.len(), buf.remaining());
|
||||
buf.put_slice(&inner_buf[..len]);
|
||||
|
||||
self.consume(len);
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncBufRead for WebSocketRW {
|
||||
fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
|
||||
impl AsyncBufRead for WebSocketRw {
|
||||
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
|
||||
// Please refer to poll_fill_buf's documentation.
|
||||
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
|
||||
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if self.as_mut().has_chunk() {
|
||||
let buf = self.project().chunk.as_ref().unwrap().chunk();
|
||||
return Poll::Ready(Ok(buf));
|
||||
} else {
|
||||
match ready!(self.as_mut().project().stream.poll_next(cx)) {
|
||||
Some(Ok(message)) => match message {
|
||||
Message::Text(_) => {}
|
||||
Message::Binary(chunk) => {
|
||||
*self.as_mut().project().chunk = Some(Bytes::from(chunk));
|
||||
}
|
||||
Message::Ping(_) => {
|
||||
// No need to send a reply: tungstenite takes care of this for you.
|
||||
}
|
||||
Message::Pong(_) => {}
|
||||
Message::Close(_) => {
|
||||
// No need to send a reply: tungstenite takes care of this for you.
|
||||
return Poll::Ready(Ok(&[]));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
unreachable!();
|
||||
}
|
||||
},
|
||||
Some(Err(err)) => return Poll::Ready(Err(ws_err_into(err))),
|
||||
None => return Poll::Ready(Ok(&[])),
|
||||
}
|
||||
if !this.bytes.chunk().is_empty() {
|
||||
let chunk = (*this.bytes).chunk();
|
||||
return Poll::Ready(Ok(chunk));
|
||||
}
|
||||
|
||||
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
|
||||
match res.transpose().map_err(io_error)? {
|
||||
Some(message) => match message {
|
||||
Message::Ping(_) => {}
|
||||
Message::Pong(_) => {}
|
||||
Message::Text(text) => {
|
||||
// We expect to see only binary messages.
|
||||
let error = "unexpected text message in the websocket";
|
||||
warn!(length = text.len(), error);
|
||||
return Poll::Ready(Err(io_error(error)));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
// This case is impossible according to Frame's doc.
|
||||
panic!("unexpected raw frame in the websocket");
|
||||
}
|
||||
Message::Binary(chunk) => {
|
||||
assert!(this.bytes.is_empty());
|
||||
*this.bytes = Bytes::from(chunk);
|
||||
}
|
||||
Message::Close(_) => return EOF,
|
||||
},
|
||||
None => return EOF,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn consume(self: Pin<&mut Self>, amt: usize) {
|
||||
if amt > 0 {
|
||||
self.project()
|
||||
.chunk
|
||||
.as_mut()
|
||||
.expect("No chunk present")
|
||||
.advance(amt);
|
||||
}
|
||||
fn consume(self: Pin<&mut Self>, amount: usize) {
|
||||
self.project().bytes.advance(amount);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,7 +146,7 @@ async fn serve_websocket(
|
||||
config,
|
||||
cancel_map,
|
||||
session_id,
|
||||
WebSocketRW::new(websocket),
|
||||
WebSocketRw::new(websocket),
|
||||
hostname,
|
||||
)
|
||||
.await?;
|
||||
@@ -199,7 +174,7 @@ async fn ws_handler(
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!("error in websocket connection: {:?}", e);
|
||||
error!("error in websocket connection: {e:?}");
|
||||
}
|
||||
});
|
||||
|
||||
@@ -211,8 +186,8 @@ async fn ws_handler(
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
ws_listener: TcpListener,
|
||||
config: &'static ProxyConfig,
|
||||
ws_listener: TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("websocket server has shut down");
|
||||
@@ -227,11 +202,12 @@ pub async fn task_main(
|
||||
}
|
||||
};
|
||||
|
||||
let addr_incoming = AddrIncoming::from_listener(ws_listener)?;
|
||||
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
|
||||
let _ = addr_incoming.set_nodelay(true);
|
||||
|
||||
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
|
||||
if let Err(err) = conn {
|
||||
error!("failed to accept TLS connection for websockets: {:?}", err);
|
||||
error!("failed to accept TLS connection for websockets: {err:?}");
|
||||
ready(false)
|
||||
} else {
|
||||
ready(true)
|
||||
|
||||
47
proxy/src/logging.rs
Normal file
47
proxy/src/logging.rs
Normal file
@@ -0,0 +1,47 @@
|
||||
use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
use tracing_subscriber::{
|
||||
filter::{EnvFilter, LevelFilter},
|
||||
prelude::*,
|
||||
};
|
||||
|
||||
/// Initialize logging and OpenTelemetry tracing and exporter.
|
||||
///
|
||||
/// Logging can be configured using `RUST_LOG` environment variable.
|
||||
///
|
||||
/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
|
||||
/// configuration from environment variables. For example, to change the
|
||||
/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
|
||||
/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
|
||||
pub async fn init() -> anyhow::Result<LoggingGuard> {
|
||||
let env_filter = EnvFilter::builder()
|
||||
.with_default_directive(LevelFilter::INFO.into())
|
||||
.from_env_lossy();
|
||||
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
.with_ansi(atty::is(atty::Stream::Stderr))
|
||||
.with_writer(std::io::stderr)
|
||||
.with_target(false);
|
||||
|
||||
let otlp_layer = tracing_utils::init_tracing("proxy")
|
||||
.await
|
||||
.map(OpenTelemetryLayer::new);
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(env_filter)
|
||||
.with(otlp_layer)
|
||||
.with(fmt_layer)
|
||||
.try_init()?;
|
||||
|
||||
Ok(LoggingGuard)
|
||||
}
|
||||
|
||||
pub struct LoggingGuard;
|
||||
|
||||
impl Drop for LoggingGuard {
|
||||
fn drop(&mut self) {
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit.
|
||||
tracing::info!("shutting down the tracing machinery");
|
||||
tracing_utils::shutdown_tracing();
|
||||
}
|
||||
}
|
||||
@@ -12,6 +12,7 @@ mod config;
|
||||
mod console;
|
||||
mod error;
|
||||
mod http;
|
||||
mod logging;
|
||||
mod metrics;
|
||||
mod parse;
|
||||
mod proxy;
|
||||
@@ -27,7 +28,7 @@ use config::ProxyConfig;
|
||||
use futures::FutureExt;
|
||||
use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
use tracing::{info, warn};
|
||||
use utils::{project_git_version, sentry_init::init_sentry};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
@@ -41,12 +42,8 @@ async fn flatten_err(
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
tracing_subscriber::fmt()
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_target(false)
|
||||
.init();
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _logging_guard = logging::init().await?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
info!("Version: {GIT_VERSION}");
|
||||
@@ -64,16 +61,17 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let mgmt_address: SocketAddr = args.get_one::<String>("mgmt").unwrap().parse()?;
|
||||
info!("Starting mgmt on {mgmt_address}");
|
||||
let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?;
|
||||
let mgmt_listener = TcpListener::bind(mgmt_address).await?;
|
||||
|
||||
let proxy_address: SocketAddr = args.get_one::<String>("proxy").unwrap().parse()?;
|
||||
info!("Starting proxy on {proxy_address}");
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
|
||||
let mut tasks = vec![
|
||||
tokio::spawn(handle_signals()),
|
||||
tokio::spawn(http::server::task_main(http_listener)),
|
||||
tokio::spawn(proxy::task_main(config, proxy_listener)),
|
||||
tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)),
|
||||
tokio::spawn(console::mgmt::task_main(mgmt_listener)),
|
||||
];
|
||||
|
||||
if let Some(wss_address) = args.get_one::<String>("wss") {
|
||||
@@ -82,35 +80,52 @@ async fn main() -> anyhow::Result<()> {
|
||||
let wss_listener = TcpListener::bind(wss_address).await?;
|
||||
|
||||
tasks.push(tokio::spawn(http::websocket::task_main(
|
||||
wss_listener,
|
||||
config,
|
||||
wss_listener,
|
||||
)));
|
||||
}
|
||||
|
||||
// TODO: refactor.
|
||||
if let Some(metric_collection) = &config.metric_collection {
|
||||
let hostname = hostname::get()?
|
||||
.into_string()
|
||||
.map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
|
||||
|
||||
tasks.push(tokio::spawn(
|
||||
metrics::collect_metrics(
|
||||
&metric_collection.endpoint,
|
||||
metric_collection.interval,
|
||||
hostname,
|
||||
)
|
||||
.instrument(info_span!("collect_metrics")),
|
||||
));
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
|
||||
}
|
||||
|
||||
// This will block until all tasks have completed.
|
||||
// Furthermore, the first one to fail will cancel the rest.
|
||||
// This combinator will block until either all tasks complete or
|
||||
// one of them finishes with an error (others will be cancelled).
|
||||
let tasks = tasks.into_iter().map(flatten_err);
|
||||
let _: Vec<()> = futures::future::try_join_all(tasks).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
async fn handle_signals() -> anyhow::Result<()> {
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
|
||||
let mut hangup = signal(SignalKind::hangup())?;
|
||||
let mut interrupt = signal(SignalKind::interrupt())?;
|
||||
let mut terminate = signal(SignalKind::terminate())?;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Hangup is commonly used for config reload.
|
||||
_ = hangup.recv() => {
|
||||
warn!("received SIGHUP; config reload is not supported");
|
||||
}
|
||||
// Shut down the whole application.
|
||||
_ = interrupt.recv() => {
|
||||
warn!("received SIGINT, exiting immediately");
|
||||
bail!("interrupted");
|
||||
}
|
||||
// TODO: Don't accept new proxy connections.
|
||||
// TODO: Shut down once all exisiting connections have been closed.
|
||||
_ = terminate.recv() => {
|
||||
warn!("received SIGTERM, exiting immediately");
|
||||
bail!("terminated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let tls_config = match (
|
||||
@@ -150,7 +165,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
|
||||
}));
|
||||
|
||||
let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
|
||||
let endpoint = http::Endpoint::new(url, reqwest::Client::new());
|
||||
let endpoint = http::Endpoint::new(url, http::new_client());
|
||||
|
||||
let api = console::provider::neon::Api::new(endpoint, caches);
|
||||
auth::BackendType::Console(Cow::Owned(api), ())
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
//!
|
||||
//! Periodically collect proxy consumption metrics
|
||||
//! and push them to a HTTP endpoint.
|
||||
//!
|
||||
use crate::{config::MetricCollectionConfig, http};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use serde::Serialize;
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use tracing::{debug, error, log::info, trace};
|
||||
use std::collections::HashMap;
|
||||
use tracing::{debug, error, info, instrument, trace};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
@@ -19,48 +18,41 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
/// so while the project-id is unique across regions the whole pipeline will work correctly
|
||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||
///
|
||||
#[derive(Eq, Hash, PartialEq, Serialize)]
|
||||
#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: String,
|
||||
}
|
||||
|
||||
pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
metric_collection_interval: Duration,
|
||||
hostname: String,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
|
||||
info!("metrics collector config: {config:?}");
|
||||
scopeguard::defer! {
|
||||
info!("collect_metrics has shut down");
|
||||
info!("metrics collector has shut down");
|
||||
}
|
||||
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
info!(
|
||||
"starting collect_metrics. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let http_client = http::new_client();
|
||||
let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
|
||||
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
|
||||
|
||||
let mut ticker = tokio::time::interval(config.interval);
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = ticker.tick() => {
|
||||
ticker.tick().await;
|
||||
|
||||
match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("Failed to send consumption metrics: {} ", e);
|
||||
},
|
||||
Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
|
||||
}
|
||||
}
|
||||
let res = collect_metrics_iteration(
|
||||
&http_client,
|
||||
&mut cached_metrics,
|
||||
&config.endpoint,
|
||||
&hostname,
|
||||
)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Err(e) => error!("failed to send consumption metrics: {e} "),
|
||||
Ok(_) => trace!("periodic metrics collection completed successfully"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
|
||||
let metrics = prometheus::default_registry().gather();
|
||||
|
||||
@@ -99,11 +91,12 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
current_metrics
|
||||
}
|
||||
|
||||
pub async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
#[instrument(skip_all)]
|
||||
async fn collect_metrics_iteration(
|
||||
client: &http::ClientWithMiddleware,
|
||||
cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
hostname: String,
|
||||
hostname: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
@@ -134,7 +127,7 @@ pub async fn collect_metrics_iteration(
|
||||
stop_time: *curr_time,
|
||||
},
|
||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||
idempotency_key: idempotency_key(hostname.clone()),
|
||||
idempotency_key: idempotency_key(hostname.to_owned()),
|
||||
value,
|
||||
extra: Ids {
|
||||
endpoint_id: curr_key.endpoint_id.clone(),
|
||||
@@ -190,6 +183,12 @@ pub async fn collect_metrics_iteration(
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk.iter() {
|
||||
// Report if the metric value is suspiciously large
|
||||
if metric.value > (1u64 << 40) {
|
||||
error!("potentially abnormal metric value: {:?}", metric);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -17,7 +17,7 @@ use once_cell::sync::Lazy;
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
/// Number of times we should retry the `/proxy_wake_compute` http request.
|
||||
const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
|
||||
@@ -91,13 +91,13 @@ pub async fn task_main(
|
||||
.unwrap_or_else(|e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
})
|
||||
.instrument(info_span!("client", session = format_args!("{session_id}"))),
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(tech debt): unite this with its twin below.
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
pub async fn handle_ws_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -127,7 +127,7 @@ pub async fn handle_ws_client(
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name, true))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
@@ -135,10 +135,11 @@ pub async fn handle_ws_client(
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session))
|
||||
.with_session(|session| client.connect_to_db(session, true))
|
||||
.await
|
||||
}
|
||||
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
async fn handle_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -165,7 +166,7 @@ async fn handle_client(
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name, false))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
@@ -173,7 +174,7 @@ async fn handle_client(
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session))
|
||||
.with_session(|session| client.connect_to_db(session, false))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -401,7 +402,11 @@ impl<'a, S> Client<'a, S> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
/// Let the client authenticate and connect to the designated compute node.
|
||||
async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> {
|
||||
async fn connect_to_db(
|
||||
self,
|
||||
session: cancellation::Session<'_>,
|
||||
allow_cleartext: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let Self {
|
||||
mut stream,
|
||||
mut creds,
|
||||
@@ -416,10 +421,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
|
||||
let auth_result = async {
|
||||
// `&mut stream` doesn't let us merge those 2 lines.
|
||||
let res = creds.authenticate(&extra, &mut stream).await;
|
||||
let res = creds
|
||||
.authenticate(&extra, &mut stream, allow_cleartext)
|
||||
.await;
|
||||
|
||||
async { res }.or_else(|e| stream.throw_error(e)).await
|
||||
}
|
||||
.instrument(info_span!("auth"))
|
||||
.await?;
|
||||
|
||||
let AuthSuccess {
|
||||
|
||||
@@ -19,12 +19,12 @@ types-requests = "^2.28.5"
|
||||
types-psycopg2 = "^2.9.18"
|
||||
boto3 = "^1.26.16"
|
||||
boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
|
||||
moto = {version = "^3.0.0", extras = ["server"]}
|
||||
moto = {extras = ["server"], version = "^4.1.2"}
|
||||
backoff = "^1.11.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "2.1.2"
|
||||
Werkzeug = "^2.2.3"
|
||||
pytest-order = "^1.0.1"
|
||||
allure-pytest = "^2.10.0"
|
||||
pytest-asyncio = "^0.19.0"
|
||||
|
||||
@@ -126,7 +126,12 @@ fn main() -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// important to keep the order of:
|
||||
// 1. init logging
|
||||
// 2. tracing panic hook
|
||||
// 3. sentry
|
||||
logging::init(LogFormat::from_config(&args.log_format)?)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
info!("version: {GIT_VERSION}");
|
||||
|
||||
let args_workdir = &args.datadir;
|
||||
|
||||
55
scripts/sk_cleanup_tenants/readme.md
Normal file
55
scripts/sk_cleanup_tenants/readme.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# Cleanup script for safekeeper
|
||||
|
||||
This script can be used to remove tenant directories on safekeepers for projects which do not longer exist (deleted in console).
|
||||
|
||||
To run this script you need to upload it to safekeeper (i.e. with SSH), and run it with python3. Ansible can be used to run this script on multiple safekeepers.
|
||||
|
||||
NOTE: Console queries to check that project is deleted are slow and inefficient.
|
||||
If you want to run this script on safekeeper with many tenants, consider
|
||||
making PR to console repo to make projects search by tenant_id faster.
|
||||
|
||||
## How to run on a single node
|
||||
|
||||
```
|
||||
zsh nsh safekeeper-0.us-east-2.aws.neon.build
|
||||
|
||||
ls /storage/safekeeper/data/ | grep -v safekeeper > tenants.txt
|
||||
|
||||
mkdir -p /storage/neon-trash/2023-01-01--cleanup
|
||||
|
||||
export CONSOLE_API_TOKEN=
|
||||
python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
|
||||
|
||||
cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
|
||||
|
||||
cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& tee logs.txt
|
||||
```
|
||||
|
||||
## How to use ansible (staging)
|
||||
|
||||
```
|
||||
cd ~/neon/.github/ansible
|
||||
|
||||
export AWS_DEFAULT_PROFILE=dev
|
||||
|
||||
ansible-playbook -i staging.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
|
||||
|
||||
# add --extra-vars "api_token=" to set console api token
|
||||
```
|
||||
|
||||
## How to use ansible (prod)
|
||||
|
||||
- Change `endpoint` in `script.py` to "https://console.neon.tech/api"
|
||||
|
||||
```
|
||||
cd ~/neon/.github/ansible
|
||||
|
||||
export AWS_DEFAULT_PROFILE=prod
|
||||
|
||||
ansible-playbook -i prod.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
|
||||
|
||||
# add --extra-vars "api_token=" to set console api token
|
||||
```
|
||||
|
||||
|
||||
> Heavily inspired with script for pageserver cleanup: https://gist.github.com/problame/bafb6ca6334f0145757238e61380c3f1/9bef1845a8291ebfa1f3a51eb79c01d12498b2b5
|
||||
80
scripts/sk_cleanup_tenants/remote.yaml
Normal file
80
scripts/sk_cleanup_tenants/remote.yaml
Normal file
@@ -0,0 +1,80 @@
|
||||
- name: Test safekeepers
|
||||
hosts: safekeepers
|
||||
gather_facts: False
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
vars:
|
||||
script_dir: /storage/ansible_sk_cleanup
|
||||
tenants_file: "{{ script_dir }}/tenants.txt"
|
||||
trash_dir: /storage/neon-trash/2023-01-01--changeme
|
||||
|
||||
tasks:
|
||||
|
||||
- name: create script directory
|
||||
file:
|
||||
path: "{{ script_dir }}"
|
||||
state: directory
|
||||
mode: 0755
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: create trash dir
|
||||
file:
|
||||
path: "{{ trash_dir }}"
|
||||
state: directory
|
||||
mode: 0755
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: collect all tenant_ids to tenants.txt
|
||||
shell:
|
||||
cmd: ls /storage/safekeeper/data/ | grep -v safekeeper > {{ tenants_file }}
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: count tenants
|
||||
shell:
|
||||
cmd: wc -l {{ tenants_file }}
|
||||
register: tenants_count
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- debug: msg="{{ tenants_count.stdout }}"
|
||||
|
||||
- name: fetch safekeeper_id
|
||||
shell:
|
||||
cmd: cat /storage/safekeeper/data/safekeeper.id
|
||||
register: safekeeper_id
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- debug: msg="{{ safekeeper_id.stdout }}"
|
||||
|
||||
- name: copy script.py to safekeeper
|
||||
copy:
|
||||
src: script.py
|
||||
dest: "{{ script_dir }}"
|
||||
mode: 0755
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: Run an async task
|
||||
shell:
|
||||
chdir: "{{ script_dir }}"
|
||||
cmd: "cat tenants.txt | python3 script.py --trash-dir {{ trash_dir }} --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& cat > {{ script_dir }}/run-`date +%Y-%m-%d-%H.%M.%S`.log"
|
||||
args:
|
||||
executable: /bin/bash
|
||||
environment:
|
||||
CONSOLE_API_TOKEN: "{{ api_token }}"
|
||||
async: 30000
|
||||
poll: 0
|
||||
register: bg_async_task
|
||||
|
||||
- name: Check on an async task
|
||||
async_status:
|
||||
jid: "{{ bg_async_task.ansible_job_id }}"
|
||||
become: true
|
||||
register: job_result
|
||||
until: job_result.finished
|
||||
retries: 3000
|
||||
delay: 10
|
||||
133
scripts/sk_cleanup_tenants/script.py
Normal file
133
scripts/sk_cleanup_tenants/script.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
level = logging.INFO
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
|
||||
datefmt="%Y-%m-%d:%H:%M:%S",
|
||||
level=level,
|
||||
)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--trash-dir", required=True, type=Path)
|
||||
parser.add_argument("--dry-run", action="store_true")
|
||||
parser.add_argument("--safekeeper-id", required=True, type=int)
|
||||
parser.add_argument("--safekeeper-host", required=True, type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
access_key = os.getenv("CONSOLE_API_TOKEN")
|
||||
endpoint: str = "https://console.stage.neon.tech/api"
|
||||
|
||||
trash_dir: Path = args.trash_dir
|
||||
dry_run: bool = args.dry_run
|
||||
logging.info(f"dry_run={dry_run}")
|
||||
sk_id: int = args.safekeeper_id
|
||||
sk_host: str = args.safekeeper_host
|
||||
|
||||
assert trash_dir.is_dir()
|
||||
|
||||
###
|
||||
|
||||
|
||||
def console_get(rel_url):
|
||||
r = requests.get(
|
||||
f"{endpoint}{rel_url}",
|
||||
headers={
|
||||
"Authorization": f"Bearer {access_key}",
|
||||
"Content-Type": "application/json",
|
||||
"Accept": "application/json",
|
||||
},
|
||||
)
|
||||
r.raise_for_status()
|
||||
return r
|
||||
|
||||
|
||||
def tenant_is_deleted_in_console(tenant_id):
|
||||
r = console_get(f"/v1/admin/projects?search={tenant_id}&show_deleted=true")
|
||||
r = r.json()
|
||||
results = r["data"]
|
||||
assert len(results) == 1, f"unexpected results len: {results}"
|
||||
r = results[0]
|
||||
assert r["tenant"] == tenant_id, f"tenant id doesn't match: {r}"
|
||||
assert r["safekeepers"] is not None, f"safekeepers is None: {r}"
|
||||
assert any(sk["id"] == sk_id for sk in r["safekeepers"]), f"safekeeper id not found: {r}"
|
||||
assert "deleted" in r, f"{r}"
|
||||
return r["deleted"] is True
|
||||
|
||||
|
||||
def call_delete_tenant_api(tenant_id):
|
||||
r = requests.delete(f"http://{sk_host}:7676/v1/tenant/{tenant_id}")
|
||||
r.raise_for_status()
|
||||
return r
|
||||
|
||||
|
||||
def cleanup_tenant(tenant_id):
|
||||
|
||||
tenant_dir = Path(f"/storage/safekeeper/data/{tenant_id}")
|
||||
|
||||
if not tenant_dir.exists():
|
||||
logging.info("tenant directory doesn't exist, assuming it has been cleaned already")
|
||||
return
|
||||
|
||||
if not tenant_is_deleted_in_console(tenant_id):
|
||||
logging.info("tenant is not deleted in console, skipping")
|
||||
return
|
||||
|
||||
logging.info("assertions passed")
|
||||
|
||||
if dry_run:
|
||||
return
|
||||
|
||||
logging.info("deleting tenant")
|
||||
|
||||
tenant_dir_in_trash = trash_dir / tenant_dir.relative_to("/")
|
||||
tenant_dir_in_trash.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
assert not tenant_dir_in_trash.exists(), f"{tenant_dir_in_trash}"
|
||||
assert tenant_dir_in_trash.parent.exists(), f"{tenant_dir_in_trash}"
|
||||
# double-check
|
||||
assert tenant_dir.exists(), f"{tenant_dir}"
|
||||
assert tenant_dir.is_dir(), f"{tenant_dir}"
|
||||
|
||||
logging.info(f"copying {tenant_dir} to {tenant_dir_in_trash}")
|
||||
shutil.copytree(src=tenant_dir, dst=tenant_dir_in_trash, symlinks=False, dirs_exist_ok=False)
|
||||
|
||||
logging.info(f"deleting {tenant_dir}")
|
||||
call_delete_tenant_api(tenant_id)
|
||||
|
||||
logging.info("tenant is now deleted, checking that it's gone")
|
||||
assert not tenant_dir.exists(), f"{tenant_dir}"
|
||||
|
||||
|
||||
if os.path.exists("script.pid"):
|
||||
logging.info(
|
||||
f"script is already running, with pid={Path('script.pid').read_text()}. Terminate it first."
|
||||
)
|
||||
exit(1)
|
||||
|
||||
with open("script.pid", "w", encoding="utf-8") as f:
|
||||
f.write(str(os.getpid()))
|
||||
|
||||
logging.info(f"started script.py, pid={os.getpid()}")
|
||||
|
||||
for line in sys.stdin:
|
||||
tenant_id = line.strip()
|
||||
try:
|
||||
logging.info(f"start tenant {tenant_id}")
|
||||
cleanup_tenant(tenant_id)
|
||||
logging.info(f"done tenant {tenant_id}")
|
||||
except KeyboardInterrupt:
|
||||
print("KeyboardInterrupt exception is caught")
|
||||
break
|
||||
except: # noqa: E722
|
||||
logging.exception(f"failed to clean up tenant {tenant_id}")
|
||||
|
||||
logging.info(f"finished script.py, pid={os.getpid()}")
|
||||
|
||||
os.remove("script.pid")
|
||||
@@ -424,12 +424,16 @@ async fn http1_handler(
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
// important to keep the order of:
|
||||
// 1. init logging
|
||||
// 2. tracing panic hook
|
||||
// 3. sentry
|
||||
logging::init(LogFormat::from_config(&args.log_format)?)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
info!("version: {GIT_VERSION}");
|
||||
::metrics::set_build_info_metric(GIT_VERSION);
|
||||
|
||||
|
||||
@@ -1217,7 +1217,7 @@ class PageserverHttpClient(requests.Session):
|
||||
"""
|
||||
Returns the tenant size, together with the model inputs as the second tuple item.
|
||||
"""
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size")
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
|
||||
self.verbose_error(res)
|
||||
res = res.json()
|
||||
assert isinstance(res, dict)
|
||||
@@ -1228,6 +1228,16 @@ class PageserverHttpClient(requests.Session):
|
||||
assert type(inputs) is dict
|
||||
return (size, inputs)
|
||||
|
||||
def tenant_size_debug(self, tenant_id: TenantId) -> str:
|
||||
"""
|
||||
Returns the tenant size debug info, as an HTML string
|
||||
"""
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
|
||||
headers={"Accept": "text/html"},
|
||||
)
|
||||
return res.text
|
||||
|
||||
def timeline_list(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
@@ -1278,6 +1288,7 @@ class PageserverHttpClient(requests.Session):
|
||||
timeline_id: TimelineId,
|
||||
include_non_incremental_logical_size: bool = False,
|
||||
include_timeline_dir_layer_file_size_sum: bool = False,
|
||||
**kwargs,
|
||||
) -> Dict[Any, Any]:
|
||||
params = {}
|
||||
if include_non_incremental_logical_size:
|
||||
@@ -1288,6 +1299,7 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
params=params,
|
||||
**kwargs,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
@@ -2067,6 +2079,9 @@ class NeonPageserver(PgProtocol):
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
|
||||
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
|
||||
".*task iteration took longer than the configured period.*",
|
||||
# this is until #3501
|
||||
".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
|
||||
]
|
||||
|
||||
def start(
|
||||
@@ -2517,15 +2532,17 @@ class NeonProxy(PgProtocol):
|
||||
tb: Optional[TracebackType],
|
||||
):
|
||||
if self._popen is not None:
|
||||
# NOTE the process will die when we're done with tests anyway, because
|
||||
# it's a child process. This is mostly to clean up in between different tests.
|
||||
self._popen.kill()
|
||||
self._popen.terminate()
|
||||
try:
|
||||
self._popen.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
log.warn("failed to gracefully terminate proxy; killing")
|
||||
self._popen.kill()
|
||||
|
||||
@staticmethod
|
||||
async def activate_link_auth(
|
||||
local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True
|
||||
):
|
||||
|
||||
pg_user = "proxy"
|
||||
|
||||
if create_user:
|
||||
|
||||
@@ -17,7 +17,9 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
cur = pgmain.connect().cursor()
|
||||
# Create table, and insert rows, each in a separate transaction
|
||||
# Disable synchronous_commit to make this initialization go faster.
|
||||
# Disable `synchronous_commit`` to make this initialization go faster.
|
||||
# XXX: on my laptop this test takes 7s, and setting `synchronous_commit=off`
|
||||
# doesn't change anything.
|
||||
#
|
||||
# Each row contains current insert LSN and the current timestamp, when
|
||||
# the row was inserted.
|
||||
@@ -32,20 +34,23 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Execute one more transaction with synchronous_commit enabled, to flush
|
||||
# all the previous transactions
|
||||
cur.execute("SET synchronous_commit=on")
|
||||
cur.execute("INSERT INTO foo VALUES (-1)")
|
||||
|
||||
# Wait until WAL is received by pageserver
|
||||
wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id)
|
||||
|
||||
with env.pageserver.http_client() as client:
|
||||
# Check edge cases: timestamp in the future
|
||||
# Check edge cases
|
||||
# Timestamp is in the future
|
||||
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
|
||||
)
|
||||
assert result == "future"
|
||||
# We should still return LSN of the first commit before timestamp
|
||||
assert result not in ["past", "nodata"]
|
||||
|
||||
# timestamp too the far history
|
||||
# Timestamp is in the unreachable past
|
||||
probe_timestamp = tbl[0][1] - timedelta(hours=10)
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
|
||||
@@ -55,10 +60,12 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
|
||||
# Probe a bunch of timestamps in the valid range
|
||||
for i in range(1, len(tbl), 100):
|
||||
probe_timestamp = tbl[i][1]
|
||||
# Call get_lsn_by_timestamp to get the LSN
|
||||
lsn = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
|
||||
)
|
||||
# Call get_lsn_by_timestamp to get the LSN
|
||||
assert lsn not in ["past", "nodata"]
|
||||
|
||||
# Launch a new read-only node at that LSN, and check that only the rows
|
||||
# that were supposed to be committed at that point in time are visible.
|
||||
pg_here = env.postgres.create_start(
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user