mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-18 19:02:56 +00:00
Compare commits
126 Commits
layer_map_
...
copy_block
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d07101d317 | ||
|
|
89e4fc3c63 | ||
|
|
d1a0f2f0eb | ||
|
|
a34e78d084 | ||
|
|
b80fe41af3 | ||
|
|
0d8ced8534 | ||
|
|
7627d85345 | ||
|
|
3f11a647c0 | ||
|
|
e43c413a3f | ||
|
|
8459e0265e | ||
|
|
03a2ce9d13 | ||
|
|
ccf92df4da | ||
|
|
37bc6d9be4 | ||
|
|
177f986795 | ||
|
|
fb1581d0b9 | ||
|
|
02b8e0e5af | ||
|
|
1b16de0d0f | ||
|
|
069b5b0a06 | ||
|
|
b05e94e4ff | ||
|
|
0acf9ace9a | ||
|
|
ca85646df4 | ||
|
|
7b9057ad01 | ||
|
|
96f65fad68 | ||
|
|
9cada8b59d | ||
|
|
66a5159511 | ||
|
|
d1a0a907ff | ||
|
|
1b780fa752 | ||
|
|
38022ff11c | ||
|
|
1b9b9d60d4 | ||
|
|
68141a924d | ||
|
|
764d27f696 | ||
|
|
b23742e09c | ||
|
|
5e514b8465 | ||
|
|
a60f687ce2 | ||
|
|
8dae879994 | ||
|
|
d19c5248c9 | ||
|
|
1360361f60 | ||
|
|
000eb1b069 | ||
|
|
f51b48fa49 | ||
|
|
9f906ff236 | ||
|
|
c79dd8d458 | ||
|
|
ec4ecdd543 | ||
|
|
20a4d817ce | ||
|
|
5ebf7e5619 | ||
|
|
0692fffbf3 | ||
|
|
093570af20 | ||
|
|
eb403da814 | ||
|
|
f3ad635911 | ||
|
|
a8d7360881 | ||
|
|
b0311cfdeb | ||
|
|
412e0aa985 | ||
|
|
965b4f4ae2 | ||
|
|
95018672fa | ||
|
|
2caece2077 | ||
|
|
b8b8c19fb4 | ||
|
|
225add041f | ||
|
|
5d001b1e5a | ||
|
|
fe462de85b | ||
|
|
c0de7f5cd8 | ||
|
|
b220ba6cd1 | ||
|
|
7de373210d | ||
|
|
5c5b03ce08 | ||
|
|
d7d3f451f0 | ||
|
|
bc7d3c6476 | ||
|
|
e3d75879c0 | ||
|
|
485b269674 | ||
|
|
ee1eda9921 | ||
|
|
e363911c85 | ||
|
|
d5d690c044 | ||
|
|
8f557477c6 | ||
|
|
af210c8b42 | ||
|
|
2153d2e00a | ||
|
|
564fa11244 | ||
|
|
8d28a24b26 | ||
|
|
53128d56d9 | ||
|
|
40799d8ae7 | ||
|
|
b242b0ad67 | ||
|
|
d90cd36bcc | ||
|
|
956b6f17ca | ||
|
|
6f9af0aa8c | ||
|
|
8e6b27bf7c | ||
|
|
ae3eff1ad2 | ||
|
|
501702b27c | ||
|
|
526f8b76aa | ||
|
|
a1b062123b | ||
|
|
a4d5c8085b | ||
|
|
edffe0dd9d | ||
|
|
d9c518b2cc | ||
|
|
0d3aefb274 | ||
|
|
6139e8e426 | ||
|
|
d9ba3c5f5e | ||
|
|
0cf7fd0fb8 | ||
|
|
f0b41e7750 | ||
|
|
5082d84f5b | ||
|
|
7991bd3b69 | ||
|
|
ddbdcdddd7 | ||
|
|
7b182e2605 | ||
|
|
1d9d7c02db | ||
|
|
a974602f9f | ||
|
|
a839860c2e | ||
|
|
a5ce2b5330 | ||
|
|
3569c1bacd | ||
|
|
86681b92aa | ||
|
|
eb21d9969d | ||
|
|
e6618f1cc0 | ||
|
|
eaff14da5f | ||
|
|
f383b4d540 | ||
|
|
694150ce40 | ||
|
|
f4359b688c | ||
|
|
948f047f0a | ||
|
|
4175cfbdac | ||
|
|
9657459d80 | ||
|
|
a4256b3250 | ||
|
|
175a577ad4 | ||
|
|
1fdf01e3bc | ||
|
|
446a39e969 | ||
|
|
7ed9eb4a56 | ||
|
|
f07d6433b6 | ||
|
|
2040db98ef | ||
|
|
371493ae32 | ||
|
|
1b9e5e84aa | ||
|
|
7ed93fff06 | ||
|
|
a6dffb6ef9 | ||
|
|
c5c14368e3 | ||
|
|
1254dc7ee2 | ||
|
|
fcb905f519 |
@@ -21,3 +21,4 @@
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
|
||||
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-ap-southeast-1
|
||||
bucket_region: ap-southeast-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
@@ -32,7 +32,7 @@ storage:
|
||||
hosts:
|
||||
safekeeper-0.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0d6f1dc5161eef894
|
||||
safekeeper-1.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-0e338adda8eb2d19f
|
||||
safekeeper-2.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-04fb63634e4679eb9
|
||||
safekeeper-3.ap-southeast-1.aws.neon.tech:
|
||||
ansible_host: i-05481f3bc88cfc2d4
|
||||
|
||||
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
|
||||
6
.github/ansible/prod.us-east-2.hosts.yaml
vendored
6
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
@@ -27,6 +27,8 @@ storage:
|
||||
ansible_host: i-062227ba7f119eb8c
|
||||
pageserver-1.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0b3ec0afab5968938
|
||||
pageserver-2.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-0d7a1c4325e71421d
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
6
.github/ansible/prod.us-west-2.hosts.yaml
vendored
6
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-west-2
|
||||
bucket_region: us-west-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
@@ -29,6 +29,8 @@ storage:
|
||||
ansible_host: i-0c834be1dddba8b3f
|
||||
pageserver-2.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-051642d372c0a4f32
|
||||
pageserver-3.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-00c3844beb9ad1c6b
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
40
.github/ansible/production.hosts.yaml
vendored
40
.github/ansible/production.hosts.yaml
vendored
@@ -1,40 +0,0 @@
|
||||
---
|
||||
storage:
|
||||
vars:
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
bucket_name: zenith-storage-oregon
|
||||
bucket_region: us-west-2
|
||||
broker_endpoint: http://storage-broker.prod.local:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
safekeeper_s3_prefix: prod-1/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
zenith-1-ps-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-3:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-4:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-5:
|
||||
console_region_id: aws-us-west-2
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
zenith-1-sk-1:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-4:
|
||||
console_region_id: aws-us-west-2
|
||||
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -2,12 +2,17 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-dev-storage-eu-west-1
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
13
.github/ansible/staging.us-east-2.hosts.yaml
vendored
13
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -2,12 +2,17 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-staging-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
@@ -31,6 +36,8 @@ storage:
|
||||
ansible_host: i-01e31cdf7e970586a
|
||||
pageserver-3.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0602a0291365ef7cc
|
||||
pageserver-99.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0c39491109bb88824
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
@@ -40,3 +47,5 @@ storage:
|
||||
ansible_host: i-0171efc3604a7b907
|
||||
safekeeper-2.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0de0b03a51676a6ce
|
||||
safekeeper-99.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0d61b6a2ea32028d5
|
||||
|
||||
@@ -1,16 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.eu-west-1.aws.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -10,7 +10,7 @@ settings:
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
domain: "pg.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy-link pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.cloud.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,16 +1,32 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
deploymentStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 100%
|
||||
maxUnavailable: 50%
|
||||
|
||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
||||
# but doesn't receive new ones
|
||||
containerLifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
||||
terminationGracePeriodSeconds: 604800
|
||||
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
# Use L4 LB
|
||||
service:
|
||||
# service.annotations -- Annotations to add to the service
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||
# assign service to this name at external-dns
|
||||
external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
|
||||
# service.type -- Service type
|
||||
type: LoadBalancer
|
||||
# service.port -- broker listen port
|
||||
port: 50051
|
||||
|
||||
ingress:
|
||||
enabled: false
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -611,34 +611,31 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_INFORMANT_VERSION: 0.1.1
|
||||
VM_BUILDER_VERSION: v0.4.6
|
||||
|
||||
steps:
|
||||
- name: Downloading latest vm-builder
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Downloading vm-builder
|
||||
run: |
|
||||
curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
|
||||
curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
- name: Pulling compute-node image
|
||||
run: |
|
||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
|
||||
- name: Building VM compute-node rootfs
|
||||
run: |
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
|
||||
chmod +x vm-informant
|
||||
|
||||
- name: Adding VM informant to compute-node image
|
||||
run: |
|
||||
ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
|
||||
docker cp vm-informant $ID:/bin/vm-informant
|
||||
docker commit $ID temp-vm-compute-node
|
||||
docker rm -f $ID
|
||||
docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
|
||||
./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
|
||||
2
.github/workflows/deploy-dev.yml
vendored
2
.github/workflows/deploy-dev.yml
vendored
@@ -67,7 +67,7 @@ jobs:
|
||||
./get_binaries.sh
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
- name: Cleanup ansible folder
|
||||
|
||||
81
.github/workflows/deploy-prod.yml
vendored
81
.github/workflows/deploy-prod.yml
vendored
@@ -40,7 +40,9 @@ concurrency:
|
||||
jobs:
|
||||
deploy-prod-new:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
container:
|
||||
image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
options: --user root --privileged
|
||||
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
@@ -66,7 +68,7 @@ jobs:
|
||||
./get_binaries.sh
|
||||
|
||||
ansible-galaxy collection install sivel.toiletwater
|
||||
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy-prod-new:
|
||||
@@ -163,78 +165,3 @@ jobs:
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
# Deploy to old account below
|
||||
|
||||
deploy:
|
||||
runs-on: prod
|
||||
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
environment:
|
||||
name: prod-old
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
ref: ${{ inputs.branch }}
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
export DOCKER_TAG=${{ inputs.dockerTag }}
|
||||
cd "$(pwd)/.github/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
eval $(ssh-agent)
|
||||
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
|
||||
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
|
||||
- name: Cleanup ansible folder
|
||||
run: rm -rf ~/.ansible
|
||||
|
||||
deploy-storage-broker:
|
||||
name: deploy storage broker on old staging and old prod
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
environment:
|
||||
name: prod-old
|
||||
env:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
ref: ${{ inputs.branch }}
|
||||
|
||||
- name: Store kubeconfig file
|
||||
run: |
|
||||
echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
|
||||
- name: Add neon helm chart
|
||||
run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
- name: Cleanup helm folder
|
||||
run: rm -rf ~/.cache
|
||||
|
||||
143
Cargo.lock
generated
143
Cargo.lock
generated
@@ -854,6 +854,7 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"postgres",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
@@ -912,11 +913,13 @@ dependencies = [
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"storage_broker",
|
||||
"tar",
|
||||
@@ -2109,6 +2112,16 @@ version = "0.3.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "mime_guess"
|
||||
version = "2.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
|
||||
dependencies = [
|
||||
"mime",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
@@ -2421,6 +2434,7 @@ dependencies = [
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"enum-map",
|
||||
"enumset",
|
||||
"fail",
|
||||
@@ -2441,6 +2455,7 @@ dependencies = [
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
@@ -2484,6 +2499,7 @@ dependencies = [
|
||||
"enum-map",
|
||||
"postgres_ffi",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -2662,6 +2678,28 @@ dependencies = [
|
||||
"postgres-protocol",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres_backend"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"once_cell",
|
||||
"pq_proto",
|
||||
"rustls",
|
||||
"rustls-pemfile",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres_connection"
|
||||
version = "0.1.0"
|
||||
@@ -2709,7 +2747,7 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
||||
name = "pq_proto"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
@@ -2881,14 +2919,18 @@ dependencies = [
|
||||
"md5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
"prometheus",
|
||||
"rand",
|
||||
"rcgen",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"reqwest-tracing",
|
||||
"routerify",
|
||||
"rstest",
|
||||
"rustls",
|
||||
@@ -2898,6 +2940,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"socket2",
|
||||
"sync_wrapper",
|
||||
"thiserror",
|
||||
"tls-listener",
|
||||
"tokio",
|
||||
@@ -2905,7 +2948,9 @@ dependencies = [
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
@@ -3035,6 +3080,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
@@ -3046,15 +3092,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "remove_dir_all"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.11.14"
|
||||
@@ -3075,6 +3112,7 @@ dependencies = [
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"mime_guess",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
@@ -3094,6 +3132,36 @@ dependencies = [
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-middleware"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"http",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"task-local-extensions",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.16.20"
|
||||
@@ -3234,15 +3302,6 @@ dependencies = [
|
||||
"base64 0.21.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-split"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78802c9612b4689d207acff746f38132ca1b12dadb55d471aa5f10fd580f47d3"
|
||||
dependencies = [
|
||||
"rustls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.11"
|
||||
@@ -3264,6 +3323,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.1.4",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
@@ -3278,6 +3338,7 @@ dependencies = [
|
||||
"parking_lot",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"regex",
|
||||
@@ -3787,17 +3848,25 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.3.0"
|
||||
name = "task-local-extensions"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
|
||||
checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4"
|
||||
dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"remove_dir_all",
|
||||
"winapi",
|
||||
"rustix",
|
||||
"windows-sys 0.42.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3805,6 +3874,8 @@ name = "tenant_size_model"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -4354,6 +4425,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
|
||||
dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.10"
|
||||
@@ -4443,7 +4523,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"criterion",
|
||||
"git-version",
|
||||
"futures",
|
||||
"heapless",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
@@ -4452,12 +4532,8 @@ dependencies = [
|
||||
"metrics",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pq_proto",
|
||||
"rand",
|
||||
"routerify",
|
||||
"rustls",
|
||||
"rustls-pemfile",
|
||||
"rustls-split",
|
||||
"sentry",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -4468,10 +4544,10 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"uuid",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -4771,15 +4847,19 @@ name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.1.4",
|
||||
"crossbeam-utils",
|
||||
"digest",
|
||||
"either",
|
||||
"fail",
|
||||
"futures",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"hashbrown 0.12.3",
|
||||
"indexmap",
|
||||
@@ -4804,6 +4884,7 @@ dependencies = [
|
||||
"socket2",
|
||||
"syn",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tower",
|
||||
|
||||
@@ -38,6 +38,7 @@ comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-utils = "0.8.5"
|
||||
either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
@@ -68,7 +69,6 @@ once_cell = "1.13"
|
||||
opentelemetry = "0.18.0"
|
||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.10.0"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
@@ -76,6 +76,8 @@ prost = "0.11"
|
||||
rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
routerify = "3"
|
||||
rpds = "0.12.0"
|
||||
rustls = "0.20"
|
||||
@@ -92,6 +94,7 @@ socket2 = "0.4.4"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
svg_fmt = "0.4.1"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||
@@ -104,6 +107,7 @@ toml = "0.5"
|
||||
toml_edit = { version = "0.17", features = ["easy"] }
|
||||
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
@@ -129,6 +133,7 @@ heapless = { default-features=false, features=[], git = "https://github.com/japa
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||
@@ -146,7 +151,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
criterion = "0.4"
|
||||
rcgen = "0.10"
|
||||
rstest = "0.16"
|
||||
tempfile = "3.2"
|
||||
tempfile = "3.4"
|
||||
tonic-build = "0.8"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
ARG PG_VERSION
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
@@ -11,7 +12,7 @@ FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev
|
||||
libicu-dev libxslt1-dev
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -23,18 +24,24 @@ FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION} postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
|
||||
--with-libxml --with-libxslt && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -50,17 +57,18 @@ RUN apt update && \
|
||||
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
|
||||
protobuf-c-compiler xsltproc
|
||||
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
|
||||
tar zxvf SFCGAL-v1.3.10.tar.gz && \
|
||||
cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make clean && cp -R /sfcgal/* /
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
tar xvzf postgis-3.3.1.tar.gz && \
|
||||
cd postgis-3.3.1 && \
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
./autogen.sh && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
cd extensions/postgis && \
|
||||
@@ -74,6 +82,15 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plv8-build"
|
||||
@@ -83,30 +100,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
# https://github.com/plv8/plv8/issues/475:
|
||||
# v8 uses gold for linking and sets `--thread-count=4` which breaks
|
||||
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
|
||||
# Install newer gold version manually as debian-testing binutils version updates
|
||||
# libc version, which in turn breaks other extension built against non-testing libc.
|
||||
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
|
||||
tar xvzf binutils-2.38.tar.gz && \
|
||||
cd binutils-2.38 && \
|
||||
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
cd ../bfd && ./configure && make bfdver.h && \
|
||||
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
|
||||
cp /usr/local/bin/ld.gold /usr/bin/gold
|
||||
|
||||
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
tar xvzf v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -124,20 +128,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
|
||||
tar xvzf h3.tgz && \
|
||||
cd h3-4.0.1 && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
|
||||
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/h3 make install && \
|
||||
cp -R /h3/usr / && \
|
||||
rm -rf build
|
||||
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
|
||||
tar xvzf h3-pg.tgz && \
|
||||
cd h3-pg-4.0.1 && \
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -153,9 +154,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
||||
FROM build-deps AS unit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
|
||||
tar xvzf 7.7.tar.gz && \
|
||||
cd postgresql-unit-7.7 && \
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
@@ -165,6 +165,201 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz &
|
||||
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "vector-pg-build"
|
||||
# compile pgvector extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgjwt-pg-build"
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hypopg-pg-build"
|
||||
# compile hypopg extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hypopg-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-hashids-pg-build"
|
||||
# compile pg_hashids extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-hashids-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rum-pg-build"
|
||||
# compile rum extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rum-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgtap-pg-build"
|
||||
# compile pgTAP extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgtap-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "prefix-pg-build"
|
||||
# compile Prefix extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS prefix-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
|
||||
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hll-pg-build"
|
||||
# compile hll extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hll-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
|
||||
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plpgsql-check-pg-build"
|
||||
# compile plpgsql_check extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
# This layer is used to build `pgx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl libclang-dev cmake && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \
|
||||
/bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-jsonschema-pg-build"
|
||||
# Compile "pg_jsonschema" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-jsonschema-pg-build
|
||||
|
||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \
|
||||
cd pg_jsonschema && \
|
||||
cargo pgx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_jsonschema.control && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-graphql-pg-build"
|
||||
# Compile "pg_graphql" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-graphql-pg-build
|
||||
|
||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \
|
||||
cd pg_graphql && \
|
||||
cargo pgx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -178,6 +373,17 @@ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -228,16 +434,19 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# libicu67, locales for collations (including ICU)
|
||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
locales \
|
||||
@@ -249,6 +458,8 @@ RUN apt update && \
|
||||
libproj19 \
|
||||
libprotobuf-c1 \
|
||||
libsfcgal1 \
|
||||
libxml2 \
|
||||
libxslt1.1 \
|
||||
gdb && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
25
Dockerfile.vm-compute-node
Normal file
25
Dockerfile.vm-compute-node
Normal file
@@ -0,0 +1,25 @@
|
||||
# Note: this file *mostly* just builds on Dockerfile.compute-node
|
||||
|
||||
ARG SRC_IMAGE
|
||||
ARG VM_INFORMANT_VERSION=v0.1.6
|
||||
|
||||
# Pull VM informant and set up inittab
|
||||
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
|
||||
|
||||
RUN set -e \
|
||||
&& rm -f /etc/inittab \
|
||||
&& touch /etc/inittab
|
||||
|
||||
RUN set -e \
|
||||
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart'" >> /etc/inittab
|
||||
|
||||
# Combine, starting from non-VM compute node image.
|
||||
FROM $SRC_IMAGE as base
|
||||
|
||||
# Temporarily set user back to root so we can run adduser
|
||||
USER root
|
||||
RUN adduser vm-informant --disabled-password --no-create-home
|
||||
USER postgres
|
||||
|
||||
COPY --from=informant /etc/inittab /etc/inittab
|
||||
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
|
||||
12
Makefile
12
Makefile
@@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-%
|
||||
|
||||
.PHONY: neon-pg-ext-clean-%
|
||||
neon-pg-ext-clean-%:
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
|
||||
15
README.md
15
README.md
@@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||
protobuf-devel
|
||||
```
|
||||
* On Arch based systems, these packages are needed:
|
||||
```bash
|
||||
pacman -S base-devel readline zlib libseccomp openssl clang \
|
||||
postgresql-libs cmake postgresql protobuf
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
@@ -83,9 +88,10 @@ cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`nproc`"
|
||||
# use "BUILD_TYPE=release make -j`nproc` -s"
|
||||
# Remove -s for the verbose build log
|
||||
|
||||
make -j`nproc`
|
||||
make -j`nproc` -s
|
||||
```
|
||||
|
||||
#### Building on OSX
|
||||
@@ -99,9 +105,10 @@ cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
|
||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s"
|
||||
# Remove -s for the verbose build log
|
||||
|
||||
make -j`sysctl -n hw.logicalcpu`
|
||||
make -j`sysctl -n hw.logicalcpu` -s
|
||||
```
|
||||
|
||||
#### Dependency installation notes
|
||||
|
||||
@@ -17,6 +17,7 @@ regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tar.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -44,7 +44,6 @@ use tracing::{error, info};
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::informant::spawn_vm_informant_if_present;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
@@ -66,6 +65,9 @@ fn main() -> Result<()> {
|
||||
let spec = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||
|
||||
@@ -78,8 +80,27 @@ fn main() -> Result<()> {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
} else if let Some(id) = compute_id {
|
||||
if let Some(cp_base) = control_plane_uri {
|
||||
let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
|
||||
reqwest::blocking::Client::new()
|
||||
.get(cp_uri)
|
||||
.header("Authorization", jwt)
|
||||
.send()?
|
||||
.json()?
|
||||
} else {
|
||||
panic!(
|
||||
"must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"",
|
||||
control_plane_uri, compute_id
|
||||
);
|
||||
}
|
||||
} else {
|
||||
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||
panic!("compute spec should be provided via --spec or --spec-path argument");
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -141,8 +162,6 @@ fn main() -> Result<()> {
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
// Also spawn the thread responsible for handling the VM informant -- if it's present
|
||||
let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
|
||||
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
@@ -230,6 +249,18 @@ fn cli() -> clap::Command {
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("compute-id")
|
||||
.short('i')
|
||||
.long("compute-id")
|
||||
.value_name("COMPUTE_ID"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("control-plane-uri")
|
||||
.short('p')
|
||||
.long("control-plane-uri")
|
||||
.value_name("CONTROL_PLANE"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -25,6 +25,7 @@ use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::{Serialize, Serializer};
|
||||
use tokio_postgres;
|
||||
use tracing::{info, instrument, warn};
|
||||
|
||||
use crate::checker::create_writability_check_data;
|
||||
@@ -284,6 +285,7 @@ impl ComputeNode {
|
||||
handle_role_deletions(self, &mut client)?;
|
||||
handle_grants(self, &mut client)?;
|
||||
create_writability_check_data(&mut client)?;
|
||||
handle_extensions(&self.spec, &mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
@@ -400,4 +402,43 @@ impl ComputeNode {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Select `pg_stat_statements` data and return it as a stringified JSON
|
||||
pub async fn collect_insights(&self) -> String {
|
||||
let mut result_rows: Vec<String> = Vec::new();
|
||||
let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await;
|
||||
let (client, connection) = connect_result.unwrap();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
let result = client
|
||||
.simple_query(
|
||||
"SELECT
|
||||
row_to_json(pg_stat_statements)
|
||||
FROM
|
||||
pg_stat_statements
|
||||
WHERE
|
||||
userid != 'cloud_admin'::regrole::oid
|
||||
ORDER BY
|
||||
(mean_exec_time + mean_plan_time) DESC
|
||||
LIMIT 100",
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Ok(raw_rows) = result {
|
||||
for message in raw_rows.iter() {
|
||||
if let postgres::SimpleQueryMessage::Row(row) = message {
|
||||
if let Some(json) = row.get(0) {
|
||||
result_rows.push(json.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
format!("{{\"pg_stat_statements\": [{}]}}", result_rows.join(","))
|
||||
} else {
|
||||
"{{\"pg_stat_statements\": []}}".to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
@@ -10,8 +11,6 @@ use serde_json;
|
||||
use tracing::{error, info};
|
||||
use tracing_utils::http::OtelName;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
||||
//
|
||||
@@ -34,6 +33,13 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
|
||||
}
|
||||
|
||||
// Collect Postgres current usage insights
|
||||
(&Method::GET, "/insights") => {
|
||||
info!("serving /insights GET request");
|
||||
let insights = compute.collect_insights().await;
|
||||
Response::new(Body::from(insights))
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let res = crate::checker::check_writability(compute).await;
|
||||
|
||||
@@ -10,12 +10,12 @@ paths:
|
||||
/status:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
- Info
|
||||
summary: Get compute node internal status
|
||||
description: ""
|
||||
operationId: getComputeStatus
|
||||
responses:
|
||||
"200":
|
||||
200:
|
||||
description: ComputeState
|
||||
content:
|
||||
application/json:
|
||||
@@ -25,27 +25,43 @@ paths:
|
||||
/metrics.json:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
- Info
|
||||
summary: Get compute node startup metrics in JSON format
|
||||
description: ""
|
||||
operationId: getComputeMetricsJSON
|
||||
responses:
|
||||
"200":
|
||||
200:
|
||||
description: ComputeMetrics
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/insights:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get current compute insights in JSON format
|
||||
description: |
|
||||
Note, that this doesn't include any historical data
|
||||
operationId: getComputeInsights
|
||||
responses:
|
||||
200:
|
||||
description: Compute insights
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeInsights"
|
||||
|
||||
/check_writability:
|
||||
post:
|
||||
tags:
|
||||
- "check"
|
||||
- Check
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritability
|
||||
responses:
|
||||
"200":
|
||||
200:
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
@@ -96,6 +112,15 @@ components:
|
||||
type: string
|
||||
description: Text of the error during compute startup, if any
|
||||
|
||||
ComputeInsights:
|
||||
type: object
|
||||
properties:
|
||||
pg_stat_statements:
|
||||
description: Contains raw output from pg_stat_statements in JSON format
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
|
||||
ComputeStatus:
|
||||
type: string
|
||||
enum:
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
use std::path::Path;
|
||||
use std::process;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
|
||||
const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
|
||||
|
||||
/// Launch a thread to start the VM informant if it's present (and restart, on failure)
|
||||
pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
|
||||
let exists = Path::new(VM_INFORMANT_PATH)
|
||||
.try_exists()
|
||||
.context("could not check if path exists")?;
|
||||
|
||||
if !exists {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(
|
||||
thread::Builder::new()
|
||||
.name("run-vm-informant".into())
|
||||
.spawn(move || run_informant())?,
|
||||
))
|
||||
}
|
||||
|
||||
fn run_informant() -> ! {
|
||||
let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
|
||||
|
||||
info!("starting VM informant");
|
||||
|
||||
loop {
|
||||
let mut cmd = process::Command::new(VM_INFORMANT_PATH);
|
||||
// Block on subprocess:
|
||||
let result = cmd.status();
|
||||
|
||||
match result {
|
||||
Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
|
||||
Ok(status) if !status.success() => {
|
||||
warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
|
||||
}
|
||||
Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
|
||||
}
|
||||
|
||||
// Wait before retrying
|
||||
thread::sleep(restart_wait);
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,6 @@ pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod informant;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -63,6 +63,8 @@ impl GenericOption {
|
||||
/// Represent `GenericOption` as configuration option.
|
||||
pub fn to_pg_setting(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
// TODO: check in the console DB that we don't have these settings
|
||||
// set for any non-deleted project and drop this override.
|
||||
let name = match self.name.as_str() {
|
||||
"safekeepers" => "neon.safekeepers",
|
||||
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
|
||||
|
||||
@@ -515,3 +515,18 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create required system extensions
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
if libs.contains("pg_stat_statements") {
|
||||
// Create extension only if this compute really needs it
|
||||
let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements";
|
||||
info!("creating system extensions with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ postgres.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
@@ -23,6 +24,7 @@ url.workspace = true
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
pageserver_api.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
|
||||
@@ -17,6 +17,7 @@ use pageserver_api::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
|
||||
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
@@ -30,7 +31,6 @@ use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
};
|
||||
|
||||
|
||||
@@ -11,10 +11,10 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
@@ -19,7 +20,6 @@ use std::process::{Command, Stdio};
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
@@ -11,6 +11,7 @@ use anyhow::{bail, Context};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
@@ -20,7 +21,6 @@ use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
@@ -419,6 +419,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.get("eviction_policy")
|
||||
.map(|x| serde_json::from_str(x))
|
||||
.transpose()
|
||||
.context("Failed to parse 'eviction_policy' json")?,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
@@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898'
|
||||
checkpoint_distance = '268435456' # in bytes
|
||||
checkpoint_timeout = '10m'
|
||||
|
||||
gc_period = '100 s'
|
||||
gc_period = '1 hour'
|
||||
gc_horizon = '67108864'
|
||||
|
||||
max_file_descriptors = '100'
|
||||
@@ -101,7 +101,7 @@ away.
|
||||
|
||||
#### gc_period
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
Interval at which garbage collection is triggered. Default is 1 hour.
|
||||
|
||||
#### image_creation_threshold
|
||||
|
||||
@@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3.
|
||||
|
||||
#### pitr_interval
|
||||
|
||||
WAL retention duration for PITR branching. Default is 30 days.
|
||||
WAL retention duration for PITR branching. Default is 7 days.
|
||||
|
||||
#### walreceiver_connect_timeout
|
||||
|
||||
|
||||
335
docs/synthetic-size.md
Normal file
335
docs/synthetic-size.md
Normal file
@@ -0,0 +1,335 @@
|
||||
# Synthetic size
|
||||
|
||||
Neon storage has copy-on-write branching, which makes it difficult to
|
||||
answer the question "how large is my database"? To give one reasonable
|
||||
answer, we calculate _synthetic size_ for a project.
|
||||
|
||||
The calculation is called "synthetic", because it is based purely on
|
||||
the user-visible logical size, which is the size that you would see on
|
||||
a standalone PostgreSQL installation, and the amount of WAL, which is
|
||||
also the same as what you'd see on a standalone PostgreSQL, for the
|
||||
same set of updates.
|
||||
|
||||
The synthetic size does *not* depend on the actual physical size
|
||||
consumed in the storage, or implementation details of the Neon storage
|
||||
like garbage collection, compaction and compression. There is a
|
||||
strong *correlation* between the physical size and the synthetic size,
|
||||
but the synthetic size is designed to be independent of the
|
||||
implementation details, so that any improvements we make in the
|
||||
storage system simply reduce our COGS. And vice versa: any bugs or bad
|
||||
implementation where we keep more data than we would need to, do not
|
||||
change the synthetic size or incur any costs to the user.
|
||||
|
||||
The synthetic size is calculated for the whole project. It is not
|
||||
straighforward to attribute size to individual branches. See "What is
|
||||
the size of an individual branch?" for discussion on those
|
||||
difficulties.
|
||||
|
||||
The synthetic size is designed to:
|
||||
|
||||
- Take into account the copy-on-write nature of the storage. For
|
||||
example, if you create a branch, it doesn't immediately add anything
|
||||
to the synthetic size. It starts to affect the synthetic size only
|
||||
as it diverges from the parent branch.
|
||||
|
||||
- Be independent of any implementation details of the storage, like
|
||||
garbage collection, remote storage, or compression.
|
||||
|
||||
## Terms & assumptions
|
||||
|
||||
- logical size is the size of a branch *at a given point in
|
||||
time*. It's the total size of all tables in all databases, as you
|
||||
see with "\l+" in psql for example, plus the Postgres SLRUs and some
|
||||
small amount of metadata. NOTE that currently, Neon does not include
|
||||
the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`.
|
||||
|
||||
- a "point in time" is defined as an LSN value. You can convert a
|
||||
timestamp to an LSN, but the storage internally works with LSNs.
|
||||
|
||||
- PITR horizon can be set per-branch.
|
||||
|
||||
- PITR horizon can be set as a time interval, e.g. 5 days or hours, or
|
||||
as amount of WAL, in bytes. If it's given as a time interval, it's
|
||||
converted to an LSN for the calculation.
|
||||
|
||||
- PITR horizon can be set to 0, if you don't want to retain any history.
|
||||
|
||||
## Calculation
|
||||
|
||||
Inputs to the calculation are:
|
||||
- logical size of the database at different points in time,
|
||||
- amount of WAL generated, and
|
||||
- the PITR horizon settings
|
||||
|
||||
The synthetic size is based on an idealistic model of the storage
|
||||
system, where we pretend that the storage consists of two things:
|
||||
- snapshots, containing a full snapshot of the database, at a given
|
||||
point in time, and
|
||||
- WAL.
|
||||
|
||||
In the simple case that the project contains just one branch (main),
|
||||
and a fixed PITR horizon, the synthetic size is the sum of:
|
||||
|
||||
- the logical size of the branch *at the beginning of the PITR
|
||||
horizon*, i.e. at the oldest point that you can still recover to, and
|
||||
- the size of the WAL covering the PITR horizon.
|
||||
|
||||
The snapshot allows you to recover to the beginning of the PITR
|
||||
horizon, and the WAL allows you to recover from that point to any
|
||||
point within the horizon.
|
||||
|
||||
```
|
||||
WAL
|
||||
-----------------------#########>
|
||||
^
|
||||
snapshot
|
||||
|
||||
Legend:
|
||||
##### PITR horizon. This is the region that you can still access
|
||||
with Point-in-time query and you can still create branches
|
||||
from.
|
||||
----- history that has fallen out of the PITR horizon, and can no
|
||||
longer be accessed
|
||||
```
|
||||
|
||||
NOTE: This is not how the storage system actually works! The actual
|
||||
implementation is also based on snapshots and WAL, but the snapshots
|
||||
are taken for individual database pages and ranges of pages rather
|
||||
than the whole database, and it is much more complicated. This model
|
||||
is a reasonable approximation, however, to make the synthetic size a
|
||||
useful proxy for the actual storage consumption.
|
||||
|
||||
|
||||
## Example: Data is INSERTed
|
||||
|
||||
For example, let's assume that your database contained 10 GB of data
|
||||
at the beginning of the PITR horizon, and you have since then inserted
|
||||
5 GB of additional data into it. The additional insertions of 5 GB of
|
||||
data consume roughly 5 GB of WAL. In that case, the synthetic size is:
|
||||
|
||||
> 10 GB (snapshot) + 5 GB (WAL) = 15 GB
|
||||
|
||||
If you now set the PITR horizon on the project to 0, so that no
|
||||
historical data is retained, then the beginning PITR horizon would be
|
||||
at the end of the branch, so the size of the snapshot would be
|
||||
calculated at the end of the branch, after the insertions. Then the
|
||||
synthetic size is:
|
||||
|
||||
> 15 GB (snapshot) + 0 GB (WAL) = 15 GB.
|
||||
|
||||
In this case, the synthetic size is the same, regardless of the PITR horizon,
|
||||
because all the history consists of inserts. The newly inserted data takes
|
||||
up the same amount of space, whether it's stored as part of the logical
|
||||
snapshot, or as WAL. (*)
|
||||
|
||||
(*) This is a rough approximation. In reality, the WAL contains
|
||||
headers and other overhead, and on the other hand, the logical
|
||||
snapshot includes empty space on pages, so the size of insertions in
|
||||
WAL can be smaller or greater than the size of the final table after
|
||||
the insertions. But in most cases, it's in the same ballpark.
|
||||
|
||||
## Example: Data is DELETEd
|
||||
|
||||
Let's look at another example:
|
||||
|
||||
Let's start again with a database that contains 10 GB of data. Then,
|
||||
you DELETE 5 GB of the data, and run VACUUM to free up the space, so
|
||||
that the logical size of the database is now only 5 GB.
|
||||
|
||||
Let's assume that the WAL for the deletions and the vacuum take up
|
||||
100 MB of space. In that case, the synthetic size of the project is:
|
||||
|
||||
> 10 GB (snapshot) + 100 MB (WAL) = 10.1 GB
|
||||
|
||||
This is much larger than the logical size of the database after the
|
||||
deletions (5 GB). That's because the system still needs to retain the
|
||||
deleted data, because it's still accessible to queries and branching
|
||||
in the PITR window.
|
||||
|
||||
If you now set the PITR horizon to 0 or just wait for time to pass so
|
||||
that the data falls out of the PITR horizon, making the deleted data
|
||||
inaccessible, the synthetic size shrinks:
|
||||
|
||||
> 5 GB (snapshot) + 0 GB (WAL) = 5 GB
|
||||
|
||||
|
||||
# Branching
|
||||
|
||||
Things get more complicated with branching. Branches in Neon are
|
||||
copy-on-write, which is also reflected in the synthetic size.
|
||||
|
||||
When you create a branch, it doesn't immediately change the synthetic
|
||||
size at all. The branch point is within the PITR horizon, and all the
|
||||
data needed to recover to that point in time needs to be retained
|
||||
anyway.
|
||||
|
||||
However, if you make modifications on the branch, the system needs to
|
||||
keep the WAL of those modifications. The WAL is included in the
|
||||
synthetic size.
|
||||
|
||||
## Example: branch and INSERT
|
||||
|
||||
Let's assume that you again start with a 10 GB database.
|
||||
On the main branch, you insert 2 GB of data. Then you create
|
||||
a branch at that point, and insert another 3 GB of data on the
|
||||
main branch, and 1 GB of data on the child branch
|
||||
|
||||
```
|
||||
child +#####>
|
||||
|
|
||||
| WAL
|
||||
main ---------###############>
|
||||
^
|
||||
snapshot
|
||||
```
|
||||
|
||||
In this case, the synthetic size consists of:
|
||||
- the snapshot at the beginning of the PITR horizon (10 GB)
|
||||
- the WAL on the main branch (2 GB + 3 GB = 5 GB)
|
||||
- the WAL on the child branch (1 GB)
|
||||
|
||||
Total: 16 GB
|
||||
|
||||
# Diverging branches
|
||||
|
||||
If there is only a small amount of changes in the database on the
|
||||
different branches, as in the previous example, the synthetic size
|
||||
consists of a snapshot before the branch point, containing all the
|
||||
shared data, and the WAL on both branches. However, if the branches
|
||||
diverge a lot, it is more efficient to store a separate snapshot of
|
||||
branches.
|
||||
|
||||
## Example: diverging branches
|
||||
|
||||
You start with a 10 GB database. You insert 5 GB of data on the main
|
||||
branch. Then you create a branch, and immediately delete all the data
|
||||
on the child branch and insert 5 GB of new data to it. Then you do the
|
||||
same on the main branch. Let's assume
|
||||
that the PITR horizon requires keeping the last 1 GB of WAL on the
|
||||
both branches.
|
||||
|
||||
```
|
||||
snapshot
|
||||
v WAL
|
||||
child +---------##############>
|
||||
|
|
||||
|
|
||||
main -------------+---------##############>
|
||||
^ WAL
|
||||
snapshot
|
||||
```
|
||||
|
||||
In this case, the synthetic size consists of:
|
||||
- snapshot at the beginning of the PITR horizon on the main branch (4 GB)
|
||||
- WAL on the main branch (1 GB)
|
||||
- snapshot at the beginning of the PITR horizon on the child branch (4 GB)
|
||||
- last 1 GB of WAL on the child branch (1 GB)
|
||||
|
||||
Total: 10 GB
|
||||
|
||||
The alternative way to store this would be to take only one snapshot
|
||||
at the beginning of branch point, and keep all the WAL on both
|
||||
branches. However, the size with that method would be larger, as it
|
||||
would require one 10 GB snapshot, and 5 GB + 5 GB of WAL. It depends
|
||||
on the amount of changes (WAL) on both branches, and the logical size
|
||||
at the branch point, which method would result in a smaller synthetic
|
||||
size. On each branch point, the system performs the calculation with
|
||||
both methods, and uses the method that is cheaper, i.e. the one that
|
||||
results in a smaller synthetic size.
|
||||
|
||||
One way to think about this is that when you create a branch, it
|
||||
starts out as a thin branch that only stores the WAL since the branch
|
||||
point. As you modify it, and the amount of WAL grows, at some point
|
||||
it becomes cheaper to store a completely new snapshot of the branch
|
||||
and truncate the WAL.
|
||||
|
||||
|
||||
# What is the size of an individual branch?
|
||||
|
||||
Synthetic size is calculated for the whole project, and includes all
|
||||
branches. There is no such thing as the size of a branch, because it
|
||||
is not straighforward to attribute the parts of size to individual
|
||||
branches.
|
||||
|
||||
## Example: attributing size to branches
|
||||
|
||||
(copied from https://github.com/neondatabase/neon/pull/2884#discussion_r1029365278)
|
||||
|
||||
Imagine that you create two branches, A and B, at the same point from
|
||||
main branch, and do a couple of small updates on both branches. Then
|
||||
six months pass, and during those six months the data on the main
|
||||
branch churns over completely multiple times. The retention period is,
|
||||
say 1 month.
|
||||
|
||||
```
|
||||
+------> A
|
||||
/
|
||||
--------------------*-------------------------------> main
|
||||
\
|
||||
+--------> B
|
||||
```
|
||||
|
||||
In that situation, the synthetic tenant size would be calculated based
|
||||
on a "logical snapshot" at the branch point, that is, the logical size
|
||||
of the database at that point. Plus the WAL on branches A and B. Let's
|
||||
say that the snapshot size is 10 GB, and the WAL is 1 MB on both
|
||||
branches A and B. So the total synthetic storage size is 10002
|
||||
MB. (Let's ignore the main branch for now, that would be just added to
|
||||
the sum)
|
||||
|
||||
How would you break that down per branch? I can think of three
|
||||
different ways to do it, and all of them have their own problems:
|
||||
|
||||
### Subtraction method
|
||||
|
||||
For each branch, calculate how much smaller the total synthetic size
|
||||
would be, if that branch didn't exist. In other words, how much would
|
||||
you save if you dropped the branch. With this method, the size of
|
||||
branches A and B is 1 MB.
|
||||
|
||||
With this method, the 10 GB shared logical snapshot is not included
|
||||
for A nor B. So the size of all branches is not equal to the total
|
||||
synthetic size of the tenant. If you drop branch A, you save 1 MB as
|
||||
you'd expect, but also the size of B suddenly jumps from 1 MB to 10001
|
||||
MB, which might feel surprising.
|
||||
|
||||
### Division method
|
||||
|
||||
Divide the common parts evenly across all branches that need
|
||||
them. With this method, the size of branches A and B would be 5001 MB.
|
||||
|
||||
With this method, the sum of all branches adds up to the total
|
||||
synthetic size. But it's surprising in other ways: if you drop branch
|
||||
A, you might think that you save 5001 MB, but in reality you only save
|
||||
1 MB, and the size of branch B suddenly grows from 5001 to 10001 MB.
|
||||
|
||||
### Addition method
|
||||
|
||||
For each branch, include all the snapshots and WAL that it depends on,
|
||||
even if some of them are shared by other branches. With this method,
|
||||
the size of branches A and B would be 10001 MB.
|
||||
|
||||
The surprise with this method is that the sum of all the branches is
|
||||
larger than the total synthetic size. And if you drop branch A, the
|
||||
total synthetic size doesn't fall by 10001 MB as you might think.
|
||||
|
||||
# Alternatives
|
||||
|
||||
A sort of cop-out method would be to show the whole tree of branches
|
||||
graphically, and for each section of WAL or logical snapshot, display
|
||||
the size of that section. You can then see which branches depend on
|
||||
which sections, which sections are shared etc. That would be good to
|
||||
have in the UI anyway.
|
||||
|
||||
Or perhaps calculate per-branch numbers using the subtraction method,
|
||||
and in addition to that, one more number for "shared size" that
|
||||
includes all the data that is needed by more than one branch.
|
||||
|
||||
## Which is the right method?
|
||||
|
||||
The bottom line is that it's not straightforward to attribute the
|
||||
synthetic size to individual branches. There are things we can do, and
|
||||
all of those methods are pretty straightforward to implement, but they
|
||||
all have their own problems. What makes sense depends a lot on what
|
||||
you want to do with the number, what question you are trying to
|
||||
answer.
|
||||
@@ -14,5 +14,6 @@ byteorder.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
enum-map.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -155,6 +155,11 @@ pub struct TenantConfigRequest {
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
// We defer the parsing of the eviction_policy field to the request handler.
|
||||
// Otherwise we'd have to move the types for eviction policy into this package.
|
||||
// We might do that once the eviction feature has stabilizied.
|
||||
// For now, this field is not even documented in the openapi_spec.yml.
|
||||
pub eviction_policy: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
@@ -174,6 +179,7 @@ impl TenantConfigRequest {
|
||||
lagging_wal_timeout: None,
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
eviction_policy: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -263,11 +269,11 @@ pub struct LayerResidenceEvent {
|
||||
///
|
||||
#[serde(rename = "timestamp_millis_since_epoch")]
|
||||
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
|
||||
timestamp: SystemTime,
|
||||
pub timestamp: SystemTime,
|
||||
/// The new residence status of the layer.
|
||||
status: LayerResidenceStatus,
|
||||
pub status: LayerResidenceStatus,
|
||||
/// The reason why we had to record this event.
|
||||
reason: LayerResidenceEventReason,
|
||||
pub reason: LayerResidenceEventReason,
|
||||
}
|
||||
|
||||
/// The reason for recording a given [`ResidenceEvent`].
|
||||
|
||||
@@ -98,6 +98,15 @@ impl RelTag {
|
||||
|
||||
name
|
||||
}
|
||||
|
||||
pub fn with_forknum(&self, forknum: u8) -> Self {
|
||||
RelTag {
|
||||
forknum,
|
||||
spcnode: self.spcnode,
|
||||
dbnode: self.dbnode,
|
||||
relnode: self.relnode,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
26
libs/postgres_backend/Cargo.toml
Normal file
26
libs/postgres_backend/Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "postgres_backend"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
futures.workspace = true
|
||||
rustls.workspace = true
|
||||
serde.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
910
libs/postgres_backend/src/lib.rs
Normal file
910
libs/postgres_backend/src/lib.rs
Normal file
@@ -0,0 +1,910 @@
|
||||
//! Server-side asynchronous Postgres connection, as limited as we need.
|
||||
//! To use, create PostgresBackend and run() it, passing the Handler
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use futures::pin_mut;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io::ErrorKind;
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{ready, Poll};
|
||||
use std::{fmt, io};
|
||||
use std::{future::Future, str::FromStr};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
use tracing::{debug, error, info, trace};
|
||||
|
||||
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
||||
use pq_proto::{
|
||||
BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
|
||||
SQLSTATE_SUCCESSFUL_COMPLETION,
|
||||
};
|
||||
|
||||
/// An error, occurred during query processing:
|
||||
/// either during the connection ([`ConnectionError`]) or before/after it.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum QueryError {
|
||||
/// The connection was lost while processing the query.
|
||||
#[error(transparent)]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<io::Error> for QueryError {
|
||||
fn from(e: io::Error) -> Self {
|
||||
Self::Disconnected(ConnectionError::Io(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_expected_io_error(e: &io::Error) -> bool {
|
||||
use io::ErrorKind::*;
|
||||
matches!(
|
||||
e.kind(),
|
||||
ConnectionRefused | ConnectionAborted | ConnectionReset
|
||||
)
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Handler {
|
||||
/// Handle single query.
|
||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||
/// might be not what we want after CopyData streaming, but currently we don't
|
||||
/// care). It will also flush out the output buffer.
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError>;
|
||||
|
||||
/// Called on startup packet receival, allows to process params.
|
||||
///
|
||||
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
|
||||
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
|
||||
/// to override whole init logic in implementations.
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check auth jwt
|
||||
fn check_auth_jwt(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_jwt_response: &[u8],
|
||||
) -> Result<(), QueryError> {
|
||||
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
|
||||
}
|
||||
}
|
||||
|
||||
/// PostgresBackend protocol state.
|
||||
/// XXX: The order of the constructors matters.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
|
||||
pub enum ProtoState {
|
||||
/// Nothing happened yet.
|
||||
Initialization,
|
||||
/// Encryption handshake is done; waiting for encrypted Startup message.
|
||||
Encrypted,
|
||||
/// Waiting for password (auth token).
|
||||
Authentication,
|
||||
/// Performed handshake and auth, ReadyForQuery is issued.
|
||||
Established,
|
||||
Closed,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ProcessMsgResult {
|
||||
Continue,
|
||||
Break,
|
||||
}
|
||||
|
||||
/// Either plain TCP stream or encrypted one, implementing AsyncRead + AsyncWrite.
|
||||
pub enum MaybeTlsStream {
|
||||
Unencrypted(tokio::net::TcpStream),
|
||||
Tls(Box<tokio_rustls::server::TlsStream<tokio::net::TcpStream>>),
|
||||
}
|
||||
|
||||
impl AsyncWrite for MaybeTlsStream {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
}
|
||||
}
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
|
||||
}
|
||||
}
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl AsyncRead for MaybeTlsStream {
|
||||
fn poll_read(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut tokio::io::ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum AuthType {
|
||||
Trust,
|
||||
// This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
|
||||
NeonJWT,
|
||||
}
|
||||
|
||||
impl FromStr for AuthType {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"Trust" => Ok(Self::Trust),
|
||||
"NeonJWT" => Ok(Self::NeonJWT),
|
||||
_ => anyhow::bail!("invalid value \"{s}\" for auth type"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for AuthType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(match self {
|
||||
AuthType::Trust => "Trust",
|
||||
AuthType::NeonJWT => "NeonJWT",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Either full duplex Framed or write only half; the latter is left in
|
||||
/// PostgresBackend after call to `split`. In principle we could always store a
|
||||
/// pair of splitted handles, but that would force to to pay splitting price
|
||||
/// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver).
|
||||
enum MaybeWriteOnly {
|
||||
Full(Framed<MaybeTlsStream>),
|
||||
WriteOnly(FramedWriter<MaybeTlsStream>),
|
||||
Broken, // temporary value palmed off during the split
|
||||
}
|
||||
|
||||
impl MaybeWriteOnly {
|
||||
async fn read_startup_message(&mut self) -> Result<Option<FeStartupPacket>, ConnectionError> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.read_message().await,
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
}
|
||||
|
||||
fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.write_message(msg),
|
||||
MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.write_message_noflush(msg),
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.flush().await,
|
||||
MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.flush().await,
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
}
|
||||
|
||||
async fn shutdown(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.shutdown().await,
|
||||
MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.shutdown().await,
|
||||
MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresBackend {
|
||||
framed: MaybeWriteOnly,
|
||||
|
||||
pub state: ProtoState,
|
||||
|
||||
auth_type: AuthType,
|
||||
|
||||
peer_addr: SocketAddr,
|
||||
pub tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
}
|
||||
|
||||
pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
|
||||
let mut query_string = query_string.to_vec();
|
||||
if let Some(ch) = query_string.last() {
|
||||
if *ch == 0 {
|
||||
query_string.pop();
|
||||
}
|
||||
}
|
||||
query_string
|
||||
}
|
||||
|
||||
/// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
impl PostgresBackend {
|
||||
pub fn new(
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> io::Result<Self> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
let stream = MaybeTlsStream::Unencrypted(socket);
|
||||
|
||||
Ok(Self {
|
||||
framed: MaybeWriteOnly::Full(Framed::new(stream)),
|
||||
state: ProtoState::Initialization,
|
||||
auth_type,
|
||||
tls_config,
|
||||
peer_addr,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_peer_addr(&self) -> &SocketAddr {
|
||||
&self.peer_addr
|
||||
}
|
||||
|
||||
/// Read full message or return None if connection is cleanly closed with no
|
||||
/// unprocessed data.
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
if let ProtoState::Closed = self.state {
|
||||
Ok(None)
|
||||
} else {
|
||||
let m = self.framed.read_message().await?;
|
||||
trace!("read msg {:?}", m);
|
||||
Ok(m)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write message into internal output buffer, doesn't flush it. Technically
|
||||
/// error type can be only ProtocolError here (if, unlikely, serialization
|
||||
/// fails), but callers typically wrap it anyway.
|
||||
pub fn write_message_noflush(
|
||||
&mut self,
|
||||
message: &BeMessage<'_>,
|
||||
) -> Result<&mut Self, ConnectionError> {
|
||||
self.framed.write_message_noflush(message)?;
|
||||
trace!("wrote msg {:?}", message);
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Flush output buffer into the socket.
|
||||
pub async fn flush(&mut self) -> io::Result<()> {
|
||||
self.framed.flush().await
|
||||
}
|
||||
|
||||
/// Polling version of `flush()`, saves the caller need to pin.
|
||||
pub fn poll_flush(
|
||||
&mut self,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let flush_fut = self.flush();
|
||||
pin_mut!(flush_fut);
|
||||
flush_fut.poll(cx)
|
||||
}
|
||||
|
||||
/// Write message into internal output buffer and flush it to the stream.
|
||||
pub async fn write_message(
|
||||
&mut self,
|
||||
message: &BeMessage<'_>,
|
||||
) -> Result<&mut Self, ConnectionError> {
|
||||
self.write_message_noflush(message)?;
|
||||
self.flush().await?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Returns an AsyncWrite implementation that wraps all the data written
|
||||
/// to it in CopyData messages, and writes them to the connection
|
||||
///
|
||||
/// The caller is responsible for sending CopyOutResponse and CopyDone messages.
|
||||
pub fn copyout_writer(&mut self) -> CopyDataWriter {
|
||||
CopyDataWriter { pgb: self }
|
||||
}
|
||||
|
||||
/// Wrapper for run_message_loop() that shuts down socket when we are done
|
||||
pub async fn run<F, S>(
|
||||
mut self,
|
||||
handler: &mut impl Handler,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
{
|
||||
let ret = self.run_message_loop(handler, shutdown_watcher).await;
|
||||
// socket might be already closed, e.g. if previously received error,
|
||||
// so ignore result.
|
||||
self.framed.shutdown().await.ok();
|
||||
ret
|
||||
}
|
||||
|
||||
async fn run_message_loop<F, S>(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
{
|
||||
trace!("postgres backend to {:?} started", self.peer_addr);
|
||||
|
||||
tokio::select!(
|
||||
biased;
|
||||
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during handshake");
|
||||
return Ok(())
|
||||
},
|
||||
|
||||
result = self.handshake(handler) => {
|
||||
// Handshake complete.
|
||||
result?;
|
||||
if self.state == ProtoState::Closed {
|
||||
return Ok(()); // EOF during handshake
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// Authentication completed
|
||||
let mut query_string = Bytes::new();
|
||||
while let Some(msg) = tokio::select!(
|
||||
biased;
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received in run_message_loop");
|
||||
Ok(None)
|
||||
},
|
||||
msg = self.read_message() => { msg },
|
||||
)? {
|
||||
trace!("got message {:?}", msg);
|
||||
|
||||
let result = self.process_message(handler, msg, &mut query_string).await;
|
||||
self.flush().await?;
|
||||
match result? {
|
||||
ProcessMsgResult::Continue => {
|
||||
self.flush().await?;
|
||||
continue;
|
||||
}
|
||||
ProcessMsgResult::Break => break,
|
||||
}
|
||||
}
|
||||
|
||||
trace!("postgres backend to {:?} exited", self.peer_addr);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Try to upgrade MaybeTlsStream into actual TLS one, performing handshake.
|
||||
async fn tls_upgrade(
|
||||
src: MaybeTlsStream,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
) -> anyhow::Result<MaybeTlsStream> {
|
||||
match src {
|
||||
MaybeTlsStream::Unencrypted(s) => {
|
||||
let acceptor = TlsAcceptor::from(tls_config);
|
||||
let tls_stream = acceptor.accept(s).await?;
|
||||
Ok(MaybeTlsStream::Tls(Box::new(tls_stream)))
|
||||
}
|
||||
MaybeTlsStream::Tls(_) => {
|
||||
anyhow::bail!("TLS already started");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn start_tls(&mut self) -> anyhow::Result<()> {
|
||||
// temporary replace stream with fake to cook TLS one, Indiana Jones style
|
||||
match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
|
||||
MaybeWriteOnly::Full(framed) => {
|
||||
let tls_config = self
|
||||
.tls_config
|
||||
.as_ref()
|
||||
.context("start_tls called without conf")?
|
||||
.clone();
|
||||
let tls_framed = framed
|
||||
.map_stream(|s| PostgresBackend::tls_upgrade(s, tls_config))
|
||||
.await?;
|
||||
// push back ready TLS stream
|
||||
self.framed = MaybeWriteOnly::Full(tls_framed);
|
||||
Ok(())
|
||||
}
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
anyhow::bail!("TLS upgrade attempt in split state")
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("TLS upgrade on framed in invalid state"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split off owned read part from which messages can be read in different
|
||||
/// task/thread.
|
||||
pub fn split(&mut self) -> anyhow::Result<PostgresBackendReader> {
|
||||
// temporary replace stream with fake to cook split one, Indiana Jones style
|
||||
match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
|
||||
MaybeWriteOnly::Full(framed) => {
|
||||
let (reader, writer) = framed.split();
|
||||
self.framed = MaybeWriteOnly::WriteOnly(writer);
|
||||
Ok(PostgresBackendReader(reader))
|
||||
}
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
anyhow::bail!("PostgresBackend is already split")
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("split on framed in invalid state"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Join read part back.
|
||||
pub fn unsplit(&mut self, reader: PostgresBackendReader) -> anyhow::Result<()> {
|
||||
// temporary replace stream with fake to cook joined one, Indiana Jones style
|
||||
match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
|
||||
MaybeWriteOnly::Full(_) => {
|
||||
anyhow::bail!("PostgresBackend is not split")
|
||||
}
|
||||
MaybeWriteOnly::WriteOnly(writer) => {
|
||||
let joined = Framed::unsplit(reader.0, writer);
|
||||
self.framed = MaybeWriteOnly::Full(joined);
|
||||
Ok(())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform handshake with the client, transitioning to Established.
|
||||
/// In case of EOF during handshake logs this, sets state to Closed and returns Ok(()).
|
||||
async fn handshake(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
|
||||
while self.state < ProtoState::Authentication {
|
||||
match self.framed.read_startup_message().await? {
|
||||
Some(msg) => {
|
||||
self.process_startup_message(handler, msg).await?;
|
||||
}
|
||||
None => {
|
||||
trace!(
|
||||
"postgres backend to {:?} received EOF during handshake",
|
||||
self.peer_addr
|
||||
);
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Perform auth, if needed.
|
||||
if self.state == ProtoState::Authentication {
|
||||
match self.framed.read_message().await? {
|
||||
Some(FeMessage::PasswordMessage(m)) => {
|
||||
assert!(self.auth_type == AuthType::NeonJWT);
|
||||
|
||||
let (_, jwt_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
self.write_message_noflush(&BeMessage::AuthenticationOk)?
|
||||
.write_message_noflush(&BeMessage::CLIENT_ENCODING)?
|
||||
.write_message(&BeMessage::ReadyForQuery)
|
||||
.await?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
Some(m) => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Unexpected message {:?} while waiting for handshake",
|
||||
m
|
||||
)));
|
||||
}
|
||||
None => {
|
||||
trace!(
|
||||
"postgres backend to {:?} received EOF during auth",
|
||||
self.peer_addr
|
||||
);
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Process startup packet:
|
||||
/// - transition to Established if auth type is trust
|
||||
/// - transition to Authentication if auth type is NeonJWT.
|
||||
/// - or perform TLS handshake -- then need to call this again to receive
|
||||
/// actual startup packet.
|
||||
async fn process_startup_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
assert!(self.state < ProtoState::Authentication);
|
||||
let have_tls = self.tls_config.is_some();
|
||||
match msg {
|
||||
FeStartupPacket::SslRequest => {
|
||||
debug!("SSL requested");
|
||||
|
||||
self.write_message(&BeMessage::EncryptionResponse(have_tls))
|
||||
.await?;
|
||||
|
||||
if have_tls {
|
||||
self.start_tls().await?;
|
||||
self.state = ProtoState::Encrypted;
|
||||
}
|
||||
}
|
||||
FeStartupPacket::GssEncRequest => {
|
||||
debug!("GSS requested");
|
||||
self.write_message(&BeMessage::EncryptionResponse(false))
|
||||
.await?;
|
||||
}
|
||||
FeStartupPacket::StartupMessage { .. } => {
|
||||
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
|
||||
self.write_message(&BeMessage::ErrorResponse("must connect with TLS", None))
|
||||
.await?;
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"client did not connect with TLS"
|
||||
)));
|
||||
}
|
||||
|
||||
// NB: startup() may change self.auth_type -- we are using that in proxy code
|
||||
// to bypass auth for new users.
|
||||
handler.startup(self, &msg)?;
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => {
|
||||
self.write_message_noflush(&BeMessage::AuthenticationOk)?
|
||||
.write_message_noflush(&BeMessage::CLIENT_ENCODING)?
|
||||
.write_message_noflush(&BeMessage::INTEGER_DATETIMES)?
|
||||
// The async python driver requires a valid server_version
|
||||
.write_message_noflush(&BeMessage::server_version("14.1"))?
|
||||
.write_message(&BeMessage::ReadyForQuery)
|
||||
.await?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
self.write_message(&BeMessage::AuthenticationCleartextPassword)
|
||||
.await?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
}
|
||||
}
|
||||
FeStartupPacket::CancelRequest { .. } => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Unexpected CancelRequest message during handshake"
|
||||
)));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn process_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
unnamed_query_string: &mut Bytes,
|
||||
) -> Result<ProcessMsgResult, QueryError> {
|
||||
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
|
||||
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
|
||||
assert!(self.state == ProtoState::Established);
|
||||
|
||||
match msg {
|
||||
FeMessage::Query(body) => {
|
||||
// remove null terminator
|
||||
let query_string = cstr_to_str(&body)?;
|
||||
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Parse(m) => {
|
||||
*unnamed_query_string = m.query_string;
|
||||
self.write_message_noflush(&BeMessage::ParseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Describe(_) => {
|
||||
self.write_message_noflush(&BeMessage::ParameterDescription)?
|
||||
.write_message_noflush(&BeMessage::NoData)?;
|
||||
}
|
||||
|
||||
FeMessage::Bind(_) => {
|
||||
self.write_message_noflush(&BeMessage::BindComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Close(_) => {
|
||||
self.write_message_noflush(&BeMessage::CloseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Execute(_) => {
|
||||
let query_string = cstr_to_str(unnamed_query_string)?;
|
||||
trace!("got execute {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
log_query_error(query_string, &e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
// NOTE there is no ReadyForQuery message. This handler is used
|
||||
// for basebackup and it uses CopyOut which doesn't require
|
||||
// ReadyForQuery message and backend just switches back to
|
||||
// processing mode after sending CopyDone or ErrorResponse.
|
||||
}
|
||||
|
||||
FeMessage::Sync => {
|
||||
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Terminate => {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
|
||||
// We prefer explicit pattern matching to wildcards, because
|
||||
// this helps us spot the places where new variants are missing
|
||||
FeMessage::CopyData(_)
|
||||
| FeMessage::CopyDone
|
||||
| FeMessage::CopyFail
|
||||
| FeMessage::PasswordMessage(_) => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unexpected message type: {msg:?}",
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
|
||||
/// Log as info/error result of handling COPY stream and send back
|
||||
/// ErrorResponse if that makes sense. Shutdown the stream if we got
|
||||
/// Terminate. TODO: transition into waiting for Sync msg if we initiate the
|
||||
/// close.
|
||||
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
|
||||
use CopyStreamHandlerEnd::*;
|
||||
|
||||
let expected_end = match &end {
|
||||
ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true,
|
||||
CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
|
||||
if is_expected_io_error(io_error) =>
|
||||
{
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
if expected_end {
|
||||
info!("terminated: {:#}", end);
|
||||
} else {
|
||||
error!("terminated: {:?}", end);
|
||||
}
|
||||
|
||||
// Note: no current usages ever send this
|
||||
if let CopyDone = &end {
|
||||
if let Err(e) = self.write_message(&BeMessage::CopyDone).await {
|
||||
error!("failed to send CopyDone: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
if let Terminate = &end {
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
|
||||
let err_to_send_and_errcode = match &end {
|
||||
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||
Other(_) => Some((end.to_string(), SQLSTATE_INTERNAL_ERROR)),
|
||||
// Note: CopyFail in duplex copy is somewhat unexpected (at least to
|
||||
// PG walsender; evidently and per my docs reading client should
|
||||
// finish it with CopyDone). It is not a problem to recover from it
|
||||
// finishing the stream in both directions like we do, but note that
|
||||
// sync rust-postgres client (which we don't use anymore) hangs if
|
||||
// socket is not closed here.
|
||||
// https://github.com/sfackler/rust-postgres/issues/755
|
||||
// https://github.com/neondatabase/neon/issues/935
|
||||
//
|
||||
// Currently, the version of tokio_postgres replication patch we use
|
||||
// sends this when it closes the stream (e.g. pageserver decided to
|
||||
// switch conn to another safekeeper and client gets dropped).
|
||||
// Moreover, seems like 'connection' task errors with 'unexpected
|
||||
// message from server' when it receives ErrorResponse (anything but
|
||||
// CopyData/CopyDone) back.
|
||||
CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||
_ => None,
|
||||
};
|
||||
if let Some((err, errcode)) = err_to_send_and_errcode {
|
||||
if let Err(ee) = self
|
||||
.write_message(&BeMessage::ErrorResponse(&err, Some(errcode)))
|
||||
.await
|
||||
{
|
||||
error!("failed to send ErrorResponse: {}", ee);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresBackendReader(FramedReader<MaybeTlsStream>);
|
||||
|
||||
impl PostgresBackendReader {
|
||||
/// Read full message or return None if connection is cleanly closed with no
|
||||
/// unprocessed data.
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
let m = self.0.read_message().await?;
|
||||
trace!("read msg {:?}", m);
|
||||
Ok(m)
|
||||
}
|
||||
|
||||
/// Get CopyData contents of the next message in COPY stream or error
|
||||
/// closing it. The error type is wider than actual errors which can happen
|
||||
/// here -- it includes 'Other' and 'ServerInitiated', but that's ok for
|
||||
/// current callers.
|
||||
pub async fn read_copy_message(&mut self) -> Result<Bytes, CopyStreamHandlerEnd> {
|
||||
match self.read_message().await? {
|
||||
Some(msg) => match msg {
|
||||
FeMessage::CopyData(m) => Ok(m),
|
||||
FeMessage::CopyDone => Err(CopyStreamHandlerEnd::CopyDone),
|
||||
FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail),
|
||||
FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate),
|
||||
_ => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol(
|
||||
ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)),
|
||||
))),
|
||||
},
|
||||
None => Err(CopyStreamHandlerEnd::EOF),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
|
||||
/// messages.
|
||||
///
|
||||
|
||||
pub struct CopyDataWriter<'a> {
|
||||
pgb: &'a mut PostgresBackend,
|
||||
}
|
||||
|
||||
impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
|
||||
// It's not strictly required to flush between each message, but makes it easier
|
||||
// to view in wireshark, and usually the messages that the callers write are
|
||||
// decently-sized anyway.
|
||||
if let Err(err) = ready!(this.pgb.poll_flush(cx)) {
|
||||
return Poll::Ready(Err(err));
|
||||
}
|
||||
|
||||
// CopyData
|
||||
// XXX: if the input is large, we should split it into multiple messages.
|
||||
// Not sure what the threshold should be, but the ultimate hard limit is that
|
||||
// the length cannot exceed u32.
|
||||
this.pgb
|
||||
.write_message_noflush(&BeMessage::CopyData(buf))
|
||||
// write_message only writes to the buffer, so it can fail iff the
|
||||
// message is invaid, but CopyData can't be invalid.
|
||||
.map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
|
||||
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
|
||||
fn poll_flush(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
}
|
||||
}
|
||||
|
||||
fn log_query_error(query: &str, e: &QueryError) {
|
||||
match e {
|
||||
QueryError::Disconnected(ConnectionError::Io(io_error)) => {
|
||||
if is_expected_io_error(io_error) {
|
||||
info!("query handler for '{query}' failed with expected io error: {io_error}");
|
||||
} else {
|
||||
error!("query handler for '{query}' failed with io error: {io_error}");
|
||||
}
|
||||
}
|
||||
QueryError::Disconnected(other_connection_error) => {
|
||||
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
||||
}
|
||||
QueryError::Other(e) => {
|
||||
error!("query handler for '{query}' failed: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Something finishing handling of COPY stream, see handle_copy_stream_end.
|
||||
/// This is not always a real error, but it allows to use ? and thiserror impls.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum CopyStreamHandlerEnd {
|
||||
/// Handler initiates the end of streaming.
|
||||
#[error("{0}")]
|
||||
ServerInitiated(String),
|
||||
#[error("received CopyDone")]
|
||||
CopyDone,
|
||||
#[error("received CopyFail")]
|
||||
CopyFail,
|
||||
#[error("received Terminate")]
|
||||
Terminate,
|
||||
#[error("EOF on COPY stream")]
|
||||
EOF,
|
||||
/// The connection was lost
|
||||
#[error(transparent)]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
139
libs/postgres_backend/tests/simple_select.rs
Normal file
139
libs/postgres_backend/tests/simple_select.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
/// Test postgres_backend_async with tokio_postgres
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
|
||||
use pq_proto::{BeMessage, RowDescriptor};
|
||||
use std::io::Cursor;
|
||||
use std::{future, sync::Arc};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tokio_postgres::tls::MakeTlsConnect;
|
||||
use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
|
||||
use tokio_postgres_rustls::MakeRustlsConnect;
|
||||
|
||||
// generate client, server test streams
|
||||
async fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
||||
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
let client_stream = TcpStream::connect(addr).await.unwrap();
|
||||
let (server_stream, _) = listener.accept().await.unwrap();
|
||||
(client_stream, server_stream)
|
||||
}
|
||||
|
||||
struct TestHandler {}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Handler for TestHandler {
|
||||
// return single col 'hey' for any query
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
_query_string: &str,
|
||||
) -> Result<(), QueryError> {
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"hey",
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some("hey".as_bytes())]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// test that basic select works
|
||||
#[tokio::test]
|
||||
async fn simple_select() {
|
||||
let (client_sock, server_sock) = make_tcp_pair().await;
|
||||
|
||||
// create and run pgbackend
|
||||
let pgbackend =
|
||||
PostgresBackend::new(server_sock, AuthType::Trust, None).expect("pgbackend creation");
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut handler = TestHandler {};
|
||||
pgbackend.run(&mut handler, future::pending::<()>).await
|
||||
});
|
||||
|
||||
let conf = Config::new();
|
||||
let (client, connection) = conf.connect_raw(client_sock, NoTls).await.expect("connect");
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0];
|
||||
if let SimpleQueryMessage::Row(row) = first_val {
|
||||
let first_col = row.get(0).expect("first column");
|
||||
assert_eq!(first_col, "hey");
|
||||
} else {
|
||||
panic!("expected SimpleQueryMessage::Row");
|
||||
}
|
||||
}
|
||||
|
||||
static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("key.pem"));
|
||||
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
|
||||
static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
|
||||
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
|
||||
// test that basic select with ssl works
|
||||
#[tokio::test]
|
||||
async fn simple_select_ssl() {
|
||||
let (client_sock, server_sock) = make_tcp_pair().await;
|
||||
|
||||
let server_cfg = rustls::ServerConfig::builder()
|
||||
.with_safe_defaults()
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(vec![CERT.clone()], KEY.clone())
|
||||
.unwrap();
|
||||
let tls_config = Some(Arc::new(server_cfg));
|
||||
let pgbackend =
|
||||
PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation");
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut handler = TestHandler {};
|
||||
pgbackend.run(&mut handler, future::pending::<()>).await
|
||||
});
|
||||
|
||||
let client_cfg = rustls::ClientConfig::builder()
|
||||
.with_safe_defaults()
|
||||
.with_root_certificates({
|
||||
let mut store = rustls::RootCertStore::empty();
|
||||
store.add(&CERT).unwrap();
|
||||
store
|
||||
})
|
||||
.with_no_client_auth();
|
||||
let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg);
|
||||
let tls_connect = <MakeRustlsConnect as MakeTlsConnect<TcpStream>>::make_tls_connect(
|
||||
&mut make_tls_connect,
|
||||
"localhost",
|
||||
)
|
||||
.expect("make_tls_connect");
|
||||
|
||||
let mut conf = Config::new();
|
||||
conf.ssl_mode(SslMode::Require);
|
||||
let (client, connection) = conf
|
||||
.connect_raw(client_sock, tls_connect)
|
||||
.await
|
||||
.expect("connect");
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0];
|
||||
if let SimpleQueryMessage::Row(row) = first_val {
|
||||
let first_col = row.get(0).expect("first column");
|
||||
assert_eq!(first_col, "hey");
|
||||
} else {
|
||||
panic!("expected SimpleQueryMessage::Row");
|
||||
}
|
||||
}
|
||||
@@ -5,8 +5,8 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
244
libs/pq_proto/src/framed.rs
Normal file
244
libs/pq_proto/src/framed.rs
Normal file
@@ -0,0 +1,244 @@
|
||||
//! Provides `Framed` -- writing/flushing and reading Postgres messages to/from
|
||||
//! the async stream based on (and buffered with) BytesMut. All functions are
|
||||
//! cancellation safe.
|
||||
//!
|
||||
//! It is similar to what tokio_util::codec::Framed with appropriate codec
|
||||
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
|
||||
//! separately without using split from futures::stream::StreamExt (which
|
||||
//! allocates box[1] in polling internally). tokio::io::split is used for splitting
|
||||
//! instead. Plus we customize error messages more than a single type for all io
|
||||
//! calls.
|
||||
//!
|
||||
//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||
use bytes::{Buf, BytesMut};
|
||||
use std::{
|
||||
future::Future,
|
||||
io::{self, ErrorKind},
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf};
|
||||
|
||||
use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError};
|
||||
|
||||
const INITIAL_CAPACITY: usize = 8 * 1024;
|
||||
|
||||
/// Error on postgres connection: either IO (physical transport error) or
|
||||
/// protocol violation.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ConnectionError {
|
||||
#[error(transparent)]
|
||||
Io(#[from] io::Error),
|
||||
#[error(transparent)]
|
||||
Protocol(#[from] ProtocolError),
|
||||
}
|
||||
|
||||
impl ConnectionError {
|
||||
/// Proxy stream.rs uses only io::Error; provide it.
|
||||
pub fn into_io_error(self) -> io::Error {
|
||||
match self {
|
||||
ConnectionError::Io(io) => io,
|
||||
ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps async io `stream`, providing messages to write/flush + read Postgres
|
||||
/// messages.
|
||||
pub struct Framed<S> {
|
||||
stream: S,
|
||||
read_buf: BytesMut,
|
||||
write_buf: BytesMut,
|
||||
}
|
||||
|
||||
impl<S> Framed<S> {
|
||||
pub fn new(stream: S) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
read_buf: BytesMut::with_capacity(INITIAL_CAPACITY),
|
||||
write_buf: BytesMut::with_capacity(INITIAL_CAPACITY),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a shared reference to the underlying stream.
|
||||
pub fn get_ref(&self) -> &S {
|
||||
&self.stream
|
||||
}
|
||||
|
||||
/// Extract the underlying stream.
|
||||
pub fn into_inner(self) -> S {
|
||||
self.stream
|
||||
}
|
||||
|
||||
/// Return new Framed with stream type transformed by async f, for TLS
|
||||
/// upgrade.
|
||||
pub async fn map_stream<S2, E, F, Fut>(self, f: F) -> Result<Framed<S2>, E>
|
||||
where
|
||||
F: FnOnce(S) -> Fut,
|
||||
Fut: Future<Output = Result<S2, E>>,
|
||||
{
|
||||
let stream = f(self.stream).await?;
|
||||
Ok(Framed {
|
||||
stream,
|
||||
read_buf: self.read_buf,
|
||||
write_buf: self.write_buf,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + Unpin> Framed<S> {
|
||||
pub async fn read_startup_message(
|
||||
&mut self,
|
||||
) -> Result<Option<FeStartupPacket>, ConnectionError> {
|
||||
read_message(&mut self.stream, &mut self.read_buf, FeStartupPacket::parse).await
|
||||
}
|
||||
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncWrite + Unpin> Framed<S> {
|
||||
/// Write next message to the output buffer; doesn't flush.
|
||||
pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
|
||||
BeMessage::write(&mut self.write_buf, msg)
|
||||
}
|
||||
|
||||
/// Flush out the buffer. This function is cancellation safe: it can be
|
||||
/// interrupted and flushing will be continued in the next call.
|
||||
pub async fn flush(&mut self) -> Result<(), io::Error> {
|
||||
flush(&mut self.stream, &mut self.write_buf).await
|
||||
}
|
||||
|
||||
/// Flush out the buffer and shutdown the stream.
|
||||
pub async fn shutdown(&mut self) -> Result<(), io::Error> {
|
||||
shutdown(&mut self.stream, &mut self.write_buf).await
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
|
||||
/// Split into owned read and write parts. Beware of potential issues with
|
||||
/// using halves in different tasks on TLS stream:
|
||||
/// https://github.com/tokio-rs/tls/issues/40
|
||||
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
|
||||
let (read_half, write_half) = tokio::io::split(self.stream);
|
||||
let reader = FramedReader {
|
||||
stream: read_half,
|
||||
read_buf: self.read_buf,
|
||||
};
|
||||
let writer = FramedWriter {
|
||||
stream: write_half,
|
||||
write_buf: self.write_buf,
|
||||
};
|
||||
(reader, writer)
|
||||
}
|
||||
|
||||
/// Join read and write parts back.
|
||||
pub fn unsplit(reader: FramedReader<S>, writer: FramedWriter<S>) -> Self {
|
||||
Self {
|
||||
stream: reader.stream.unsplit(writer.stream),
|
||||
read_buf: reader.read_buf,
|
||||
write_buf: writer.write_buf,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Read-only version of `Framed`.
|
||||
pub struct FramedReader<S> {
|
||||
stream: ReadHalf<S>,
|
||||
read_buf: BytesMut,
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + Unpin> FramedReader<S> {
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Write-only version of `Framed`.
|
||||
pub struct FramedWriter<S> {
|
||||
stream: WriteHalf<S>,
|
||||
write_buf: BytesMut,
|
||||
}
|
||||
|
||||
impl<S: AsyncWrite + Unpin> FramedWriter<S> {
|
||||
/// Write next message to the output buffer; doesn't flush.
|
||||
pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
|
||||
BeMessage::write(&mut self.write_buf, msg)
|
||||
}
|
||||
|
||||
/// Flush out the buffer. This function is cancellation safe: it can be
|
||||
/// interrupted and flushing will be continued in the next call.
|
||||
pub async fn flush(&mut self) -> Result<(), io::Error> {
|
||||
flush(&mut self.stream, &mut self.write_buf).await
|
||||
}
|
||||
|
||||
/// Flush out the buffer and shutdown the stream.
|
||||
pub async fn shutdown(&mut self) -> Result<(), io::Error> {
|
||||
shutdown(&mut self.stream, &mut self.write_buf).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Read next message from the stream. Returns Ok(None), if EOF happened and we
|
||||
/// don't have remaining data in the buffer. This function is cancellation safe:
|
||||
/// you can drop future which is not yet complete and finalize reading message
|
||||
/// with the next call.
|
||||
///
|
||||
/// Parametrized to allow reading startup or usual message, having different
|
||||
/// format.
|
||||
async fn read_message<S: AsyncRead + Unpin, M, P>(
|
||||
stream: &mut S,
|
||||
read_buf: &mut BytesMut,
|
||||
parse: P,
|
||||
) -> Result<Option<M>, ConnectionError>
|
||||
where
|
||||
P: Fn(&mut BytesMut) -> Result<Option<M>, ProtocolError>,
|
||||
{
|
||||
loop {
|
||||
if let Some(msg) = parse(read_buf)? {
|
||||
return Ok(Some(msg));
|
||||
}
|
||||
// If we can't build a frame yet, try to read more data and try again.
|
||||
// Make sure we've got room for at least one byte to read to ensure
|
||||
// that we don't get a spurious 0 that looks like EOF.
|
||||
read_buf.reserve(1);
|
||||
if stream.read_buf(read_buf).await? == 0 {
|
||||
if read_buf.has_remaining() {
|
||||
return Err(io::Error::new(
|
||||
ErrorKind::UnexpectedEof,
|
||||
"EOF with unprocessed data in the buffer",
|
||||
)
|
||||
.into());
|
||||
} else {
|
||||
return Ok(None); // clean EOF
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn flush<S: AsyncWrite + Unpin>(
|
||||
stream: &mut S,
|
||||
write_buf: &mut BytesMut,
|
||||
) -> Result<(), io::Error> {
|
||||
while write_buf.has_remaining() {
|
||||
let bytes_written = stream.write(write_buf.chunk()).await?;
|
||||
if bytes_written == 0 {
|
||||
return Err(io::Error::new(
|
||||
ErrorKind::WriteZero,
|
||||
"failed to write message",
|
||||
));
|
||||
}
|
||||
// The advanced part will be garbage collected, likely during shifting
|
||||
// data left on next attempt to write to buffer when free space is not
|
||||
// enough.
|
||||
write_buf.advance(bytes_written);
|
||||
}
|
||||
write_buf.clear();
|
||||
stream.flush().await
|
||||
}
|
||||
|
||||
async fn shutdown<S: AsyncWrite + Unpin>(
|
||||
stream: &mut S,
|
||||
write_buf: &mut BytesMut,
|
||||
) -> Result<(), io::Error> {
|
||||
flush(stream, write_buf).await?;
|
||||
stream.shutdown().await
|
||||
}
|
||||
@@ -2,24 +2,18 @@
|
||||
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
|
||||
//! on message formats.
|
||||
|
||||
// Tools for calling certain async methods in sync contexts.
|
||||
pub mod sync;
|
||||
pub mod framed;
|
||||
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
fmt,
|
||||
future::Future,
|
||||
io::{self, Cursor},
|
||||
str,
|
||||
fmt, io, str,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
use sync::{AsyncishRead, SyncFuture};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::{trace, warn};
|
||||
|
||||
pub type Oid = u32;
|
||||
@@ -31,7 +25,6 @@ pub const TEXT_OID: Oid = 25;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum FeMessage {
|
||||
StartupPacket(FeStartupPacket),
|
||||
// Simple query.
|
||||
Query(Bytes),
|
||||
// Extended query protocol.
|
||||
@@ -75,27 +68,36 @@ impl StartupMessageParams {
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let mut last_was_escape = false;
|
||||
let iter = self
|
||||
.get("options")?
|
||||
.split(move |c: char| {
|
||||
// We split by non-escaped whitespace symbols.
|
||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||
last_was_escape = c == '\\' && !last_was_escape;
|
||||
should_split
|
||||
})
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
Some(iter)
|
||||
self.get("options").map(Self::parse_options_raw)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
||||
self.get("options").map(Self::parse_options_escaped)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let iter = self.options_raw()?.map(|s| {
|
||||
let mut last_was_escape = false;
|
||||
input
|
||||
.split(move |c: char| {
|
||||
// We split by non-escaped whitespace symbols.
|
||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||
last_was_escape = c == '\\' && !last_was_escape;
|
||||
should_split
|
||||
})
|
||||
.filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
pub fn parse_options_escaped(input: &str) -> impl Iterator<Item = Cow<'_, str>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
Self::parse_options_raw(input).map(|s| {
|
||||
let mut preserve_next_escape = false;
|
||||
let escape = |c| {
|
||||
// We should remove '\\' unless it's preceded by '\\'.
|
||||
@@ -108,9 +110,12 @@ impl StartupMessageParams {
|
||||
true => Cow::Owned(s.replace(escape, "")),
|
||||
false => Cow::Borrowed(s),
|
||||
}
|
||||
});
|
||||
})
|
||||
}
|
||||
|
||||
Some(iter)
|
||||
/// Iterate through key-value pairs in an arbitrary order.
|
||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
@@ -179,260 +184,207 @@ pub struct FeExecuteMessage {
|
||||
#[derive(Debug)]
|
||||
pub struct FeCloseMessage;
|
||||
|
||||
/// Retry a read on EINTR
|
||||
///
|
||||
/// This runs the enclosed expression, and if it returns
|
||||
/// Err(io::ErrorKind::Interrupted), retries it.
|
||||
macro_rules! retry_read {
|
||||
( $x:expr ) => {
|
||||
loop {
|
||||
match $x {
|
||||
Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
|
||||
res => break res,
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// An error occured during connection being open.
|
||||
/// An error occured while parsing or serializing raw stream into Postgres
|
||||
/// messages.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ConnectionError {
|
||||
/// IO error during writing to or reading from the connection socket.
|
||||
#[error("Socket IO error: {0}")]
|
||||
Socket(std::io::Error),
|
||||
/// Invalid packet was received from client
|
||||
pub enum ProtocolError {
|
||||
/// Invalid packet was received from the client (e.g. unexpected message
|
||||
/// type or broken len).
|
||||
#[error("Protocol error: {0}")]
|
||||
Protocol(String),
|
||||
/// Failed to parse a protocol mesage
|
||||
/// Failed to parse or, (unlikely), serialize a protocol message.
|
||||
#[error("Message parse error: {0}")]
|
||||
MessageParse(anyhow::Error),
|
||||
BadMessage(String),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for ConnectionError {
|
||||
fn from(e: anyhow::Error) -> Self {
|
||||
Self::MessageParse(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnectionError {
|
||||
impl ProtocolError {
|
||||
/// Proxy stream.rs uses only io::Error; provide it.
|
||||
pub fn into_io_error(self) -> io::Error {
|
||||
match self {
|
||||
ConnectionError::Socket(io) => io,
|
||||
other => io::Error::new(io::ErrorKind::Other, other.to_string()),
|
||||
}
|
||||
io::Error::new(io::ErrorKind::Other, self.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
/// Read one message from the stream.
|
||||
/// This function returns `Ok(None)` in case of EOF.
|
||||
/// One way to handle this properly:
|
||||
/// Read and parse one message from the `buf` input buffer. If there is at
|
||||
/// least one valid message, returns it, advancing `buf`; redundant copies
|
||||
/// are avoided, as thanks to `bytes` crate ptrs in parsed message point
|
||||
/// directly into the `buf` (processed data is garbage collected after
|
||||
/// parsed message is dropped).
|
||||
///
|
||||
/// ```
|
||||
/// # use std::io;
|
||||
/// # use pq_proto::FeMessage;
|
||||
/// #
|
||||
/// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
|
||||
/// # Ok(())
|
||||
/// # };
|
||||
/// #
|
||||
/// fn do_the_job(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<()> {
|
||||
/// while let Some(msg) = FeMessage::read(stream)? {
|
||||
/// process_message(msg)?;
|
||||
/// }
|
||||
/// Returns None if `buf` doesn't contain enough data for a single message.
|
||||
/// For efficiency, tries to reserve large enough space in `buf` for the
|
||||
/// next message in this case to save the repeated calls.
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[inline(never)]
|
||||
pub fn read(
|
||||
stream: &mut (impl io::Read + Unpin),
|
||||
) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
Self::read_fut(&mut AsyncishRead(stream)).wait()
|
||||
}
|
||||
/// Returns Error if message is malformed, the only possible ErrorKind is
|
||||
/// InvalidInput.
|
||||
//
|
||||
// Inspired by rust-postgres Message::parse.
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>, ProtocolError> {
|
||||
// Every message contains message type byte and 4 bytes len; can't do
|
||||
// much without them.
|
||||
if buf.len() < 5 {
|
||||
let to_read = 5 - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
/// Read one message from the stream.
|
||||
/// See documentation for `Self::read`.
|
||||
pub fn read_fut<Reader>(
|
||||
stream: &mut Reader,
|
||||
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
|
||||
where
|
||||
Reader: tokio::io::AsyncRead + Unpin,
|
||||
{
|
||||
// We return a Future that's sync (has a `wait` method) if and only if the provided stream is SyncProof.
|
||||
// SyncFuture contract: we are only allowed to await on sync-proof futures, the AsyncRead and
|
||||
// AsyncReadExt methods of the stream.
|
||||
SyncFuture::new(async move {
|
||||
// Each libpq message begins with a message type byte, followed by message length
|
||||
// If the client closes the connection, return None. But if the client closes the
|
||||
// connection in the middle of a message, we will return an error.
|
||||
let tag = match retry_read!(stream.read_u8().await) {
|
||||
Ok(b) => b,
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(ConnectionError::Socket(e)),
|
||||
};
|
||||
// We shouldn't advance `buf` as probably full message is not there yet,
|
||||
// so can't directly use Bytes::get_u32 etc.
|
||||
let tag = buf[0];
|
||||
let len = (&buf[1..5]).read_u32::<BigEndian>().unwrap();
|
||||
if len < 4 {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"invalid message length {}",
|
||||
len
|
||||
)));
|
||||
}
|
||||
|
||||
// The message length includes itself, so it better be at least 4.
|
||||
let len = retry_read!(stream.read_u32().await)
|
||||
.map_err(ConnectionError::Socket)?
|
||||
.checked_sub(4)
|
||||
.ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
|
||||
// length field includes itself, but not message type.
|
||||
let total_len = len as usize + 1;
|
||||
if buf.len() < total_len {
|
||||
// Don't have full message yet.
|
||||
let to_read = total_len - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let body = {
|
||||
let mut buffer = vec![0u8; len as usize];
|
||||
stream
|
||||
.read_exact(&mut buffer)
|
||||
.await
|
||||
.map_err(ConnectionError::Socket)?;
|
||||
Bytes::from(buffer)
|
||||
};
|
||||
// got the message, advance buffer
|
||||
let mut msg = buf.split_to(total_len).freeze();
|
||||
msg.advance(5); // consume message type and len
|
||||
|
||||
match tag {
|
||||
b'Q' => Ok(Some(FeMessage::Query(body))),
|
||||
b'P' => Ok(Some(FeParseMessage::parse(body)?)),
|
||||
b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
|
||||
b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
|
||||
b'B' => Ok(Some(FeBindMessage::parse(body)?)),
|
||||
b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
|
||||
b'S' => Ok(Some(FeMessage::Sync)),
|
||||
b'X' => Ok(Some(FeMessage::Terminate)),
|
||||
b'd' => Ok(Some(FeMessage::CopyData(body))),
|
||||
b'c' => Ok(Some(FeMessage::CopyDone)),
|
||||
b'f' => Ok(Some(FeMessage::CopyFail)),
|
||||
b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
|
||||
tag => {
|
||||
return Err(ConnectionError::Protocol(format!(
|
||||
"unknown message tag: {tag},'{body:?}'"
|
||||
)))
|
||||
}
|
||||
match tag {
|
||||
b'Q' => Ok(Some(FeMessage::Query(msg))),
|
||||
b'P' => Ok(Some(FeParseMessage::parse(msg)?)),
|
||||
b'D' => Ok(Some(FeDescribeMessage::parse(msg)?)),
|
||||
b'E' => Ok(Some(FeExecuteMessage::parse(msg)?)),
|
||||
b'B' => Ok(Some(FeBindMessage::parse(msg)?)),
|
||||
b'C' => Ok(Some(FeCloseMessage::parse(msg)?)),
|
||||
b'S' => Ok(Some(FeMessage::Sync)),
|
||||
b'X' => Ok(Some(FeMessage::Terminate)),
|
||||
b'd' => Ok(Some(FeMessage::CopyData(msg))),
|
||||
b'c' => Ok(Some(FeMessage::CopyDone)),
|
||||
b'f' => Ok(Some(FeMessage::CopyFail)),
|
||||
b'p' => Ok(Some(FeMessage::PasswordMessage(msg))),
|
||||
tag => {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"unknown message tag: {tag},'{msg:?}'"
|
||||
)))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FeStartupPacket {
|
||||
/// Read startup message from the stream.
|
||||
// XXX: It's tempting yet undesirable to accept `stream` by value,
|
||||
// since such a change will cause user-supplied &mut references to be consumed
|
||||
pub fn read(
|
||||
stream: &mut (impl io::Read + Unpin),
|
||||
) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
Self::read_fut(&mut AsyncishRead(stream)).wait()
|
||||
}
|
||||
|
||||
/// Read startup message from the stream.
|
||||
// XXX: It's tempting yet undesirable to accept `stream` by value,
|
||||
// since such a change will cause user-supplied &mut references to be consumed
|
||||
pub fn read_fut<Reader>(
|
||||
stream: &mut Reader,
|
||||
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
|
||||
where
|
||||
Reader: tokio::io::AsyncRead + Unpin,
|
||||
{
|
||||
/// Read and parse startup message from the `buf` input buffer. It is
|
||||
/// different from [`FeMessage::parse`] because startup messages don't have
|
||||
/// message type byte; otherwise, its comments apply.
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
|
||||
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
|
||||
const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
|
||||
const CANCEL_REQUEST_CODE: u32 = 5678;
|
||||
const NEGOTIATE_SSL_CODE: u32 = 5679;
|
||||
const NEGOTIATE_GSS_CODE: u32 = 5680;
|
||||
|
||||
SyncFuture::new(async move {
|
||||
// Read length. If the connection is closed before reading anything (or before
|
||||
// reading 4 bytes, to be precise), return None to indicate that the connection
|
||||
// was closed. This matches the PostgreSQL server's behavior, which avoids noise
|
||||
// in the log if the client opens connection but closes it immediately.
|
||||
let len = match retry_read!(stream.read_u32().await) {
|
||||
Ok(len) => len as usize,
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(ConnectionError::Socket(e)),
|
||||
};
|
||||
// need at least 4 bytes with packet len
|
||||
if buf.len() < 4 {
|
||||
let to_read = 4 - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(ConnectionError::Protocol(format!(
|
||||
"invalid message length {len}"
|
||||
// We shouldn't advance `buf` as probably full message is not there yet,
|
||||
// so can't directly use Bytes::get_u32 etc.
|
||||
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"invalid startup packet message length {}",
|
||||
len
|
||||
)));
|
||||
}
|
||||
|
||||
if buf.len() < len {
|
||||
// Don't have full message yet.
|
||||
let to_read = len - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// got the message, advance buffer
|
||||
let mut msg = buf.split_to(len).freeze();
|
||||
msg.advance(4); // consume len
|
||||
|
||||
let request_code = msg.get_u32();
|
||||
let req_hi = request_code >> 16;
|
||||
let req_lo = request_code & ((1 << 16) - 1);
|
||||
// StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
|
||||
let message = match (req_hi, req_lo) {
|
||||
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
|
||||
if msg.remaining() != 8 {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"CancelRequest message is malformed, backend PID / secret key missing"
|
||||
.to_owned(),
|
||||
));
|
||||
}
|
||||
FeStartupPacket::CancelRequest(CancelKeyData {
|
||||
backend_pid: msg.get_i32(),
|
||||
cancel_key: msg.get_i32(),
|
||||
})
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
|
||||
// Requested upgrade to SSL (aka TLS)
|
||||
FeStartupPacket::SslRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
|
||||
// Requested upgrade to GSSAPI
|
||||
FeStartupPacket::GssEncRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"Unrecognized request code {unrecognized_code}"
|
||||
)));
|
||||
}
|
||||
// TODO bail if protocol major_version is not 3?
|
||||
(major_version, minor_version) => {
|
||||
// StartupMessage
|
||||
|
||||
let request_code =
|
||||
retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
|
||||
// Parse pairs of null-terminated strings (key, value).
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let mut tokens = str::from_utf8(&msg)
|
||||
.map_err(|_e| {
|
||||
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
|
||||
})?
|
||||
.strip_suffix('\0') // drop packet's own null
|
||||
.ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?
|
||||
.split_terminator('\0');
|
||||
|
||||
// the rest of startup packet are params
|
||||
let params_len = len - 8;
|
||||
let mut params_bytes = vec![0u8; params_len];
|
||||
stream
|
||||
.read_exact(params_bytes.as_mut())
|
||||
.await
|
||||
.map_err(ConnectionError::Socket)?;
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens.next().ok_or_else(|| {
|
||||
ProtocolError::Protocol(
|
||||
"StartupMessage params: key without value".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
// Parse params depending on request code
|
||||
let req_hi = request_code >> 16;
|
||||
let req_lo = request_code & ((1 << 16) - 1);
|
||||
let message = match (req_hi, req_lo) {
|
||||
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
|
||||
if params_len != 8 {
|
||||
return Err(ConnectionError::Protocol(
|
||||
"expected 8 bytes for CancelRequest params".to_string(),
|
||||
));
|
||||
}
|
||||
let mut cursor = Cursor::new(params_bytes);
|
||||
FeStartupPacket::CancelRequest(CancelKeyData {
|
||||
backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
|
||||
cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
|
||||
})
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
|
||||
// Requested upgrade to SSL (aka TLS)
|
||||
FeStartupPacket::SslRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
|
||||
// Requested upgrade to GSSAPI
|
||||
FeStartupPacket::GssEncRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
|
||||
return Err(ConnectionError::Protocol(format!(
|
||||
"Unrecognized request code {unrecognized_code}"
|
||||
)));
|
||||
}
|
||||
// TODO bail if protocol major_version is not 3?
|
||||
(major_version, minor_version) => {
|
||||
// Parse pairs of null-terminated strings (key, value).
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let mut tokens = str::from_utf8(¶ms_bytes)
|
||||
.context("StartupMessage params: invalid utf-8")?
|
||||
.strip_suffix('\0') // drop packet's own null
|
||||
.ok_or_else(|| {
|
||||
ConnectionError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?
|
||||
.split_terminator('\0');
|
||||
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens.next().ok_or_else(|| {
|
||||
ConnectionError::Protocol(
|
||||
"StartupMessage params: key without value".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
}
|
||||
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
params: StartupMessageParams { params },
|
||||
}
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
params: StartupMessageParams { params },
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(FeMessage::StartupPacket(message)))
|
||||
})
|
||||
}
|
||||
};
|
||||
Ok(Some(message))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeParseMessage {
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
|
||||
// FIXME: the rust-postgres driver uses a named prepared statement
|
||||
// for copy_out(). We're not prepared to handle that correctly. For
|
||||
// now, just ignore the statement name, assuming that the client never
|
||||
@@ -440,55 +392,82 @@ impl FeParseMessage {
|
||||
|
||||
let _pstmt_name = read_cstr(&mut buf)?;
|
||||
let query_string = read_cstr(&mut buf)?;
|
||||
if buf.remaining() < 2 {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"Parse message is malformed, nparams missing".to_string(),
|
||||
));
|
||||
}
|
||||
let nparams = buf.get_i16();
|
||||
|
||||
ensure!(nparams == 0, "query params not implemented");
|
||||
if nparams != 0 {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"query params not implemented".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Parse(FeParseMessage { query_string }))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeDescribeMessage {
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
|
||||
let kind = buf.get_u8();
|
||||
let _pstmt_name = read_cstr(&mut buf)?;
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
ensure!(
|
||||
kind == b'S',
|
||||
"only prepared statemement Describe is implemented"
|
||||
);
|
||||
if kind != b'S' {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"only prepared statemement Describe is implemented".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Describe(FeDescribeMessage { kind }))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeExecuteMessage {
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
|
||||
let portal_name = read_cstr(&mut buf)?;
|
||||
if buf.remaining() < 4 {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"FeExecuteMessage message is malformed, maxrows missing".to_string(),
|
||||
));
|
||||
}
|
||||
let maxrows = buf.get_i32();
|
||||
|
||||
ensure!(portal_name.is_empty(), "named portals not implemented");
|
||||
ensure!(maxrows == 0, "row limit in Execute message not implemented");
|
||||
if !portal_name.is_empty() {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"named portals not implemented".to_string(),
|
||||
));
|
||||
}
|
||||
if maxrows != 0 {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"row limit in Execute message not implemented".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeBindMessage {
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
|
||||
let portal_name = read_cstr(&mut buf)?;
|
||||
let _pstmt_name = read_cstr(&mut buf)?;
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
ensure!(portal_name.is_empty(), "named portals not implemented");
|
||||
if !portal_name.is_empty() {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"named portals not implemented".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Bind(FeBindMessage))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeCloseMessage {
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
|
||||
let _kind = buf.get_u8();
|
||||
let _pstmt_or_portal_name = read_cstr(&mut buf)?;
|
||||
|
||||
@@ -517,6 +496,7 @@ pub enum BeMessage<'a> {
|
||||
CloseComplete,
|
||||
// None means column is NULL
|
||||
DataRow(&'a [Option<&'a [u8]>]),
|
||||
// None errcode means internal_error will be sent.
|
||||
ErrorResponse(&'a str, Option<&'a [u8; 5]>),
|
||||
/// Single byte - used in response to SSLRequest/GSSENCRequest.
|
||||
EncryptionResponse(bool),
|
||||
@@ -547,6 +527,11 @@ impl<'a> BeMessage<'a> {
|
||||
value: b"UTF8",
|
||||
};
|
||||
|
||||
pub const INTEGER_DATETIMES: Self = Self::ParameterStatus {
|
||||
name: b"integer_datetimes",
|
||||
value: b"on",
|
||||
};
|
||||
|
||||
/// Build a [`BeMessage::ParameterStatus`] holding the server version.
|
||||
pub fn server_version(version: &'a str) -> Self {
|
||||
Self::ParameterStatus {
|
||||
@@ -625,7 +610,7 @@ impl RowDescriptor<'_> {
|
||||
#[derive(Debug)]
|
||||
pub struct XLogDataBody<'a> {
|
||||
pub wal_start: u64,
|
||||
pub wal_end: u64,
|
||||
pub wal_end: u64, // current end of WAL on the server
|
||||
pub timestamp: i64,
|
||||
pub data: &'a [u8],
|
||||
}
|
||||
@@ -665,12 +650,11 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
|
||||
}
|
||||
|
||||
/// Safe write of s into buf as cstring (String in the protocol).
|
||||
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
|
||||
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolError> {
|
||||
let bytes = s.as_ref();
|
||||
if bytes.contains(&0) {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"string contains embedded null",
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"string contains embedded null".to_owned(),
|
||||
));
|
||||
}
|
||||
buf.put_slice(bytes);
|
||||
@@ -678,22 +662,27 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
|
||||
let pos = buf.iter().position(|x| *x == 0);
|
||||
let result = buf.split_to(pos.context("missing terminator")?);
|
||||
/// Read cstring from buf, advancing it.
|
||||
fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
|
||||
let pos = buf
|
||||
.iter()
|
||||
.position(|x| *x == 0)
|
||||
.ok_or_else(|| ProtocolError::BadMessage("missing cstring terminator".to_owned()))?;
|
||||
let result = buf.split_to(pos);
|
||||
buf.advance(1); // drop the null terminator
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
|
||||
pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";
|
||||
|
||||
impl<'a> BeMessage<'a> {
|
||||
/// Write message to the given buf.
|
||||
// Unlike the reading side, we use BytesMut
|
||||
// here as msg len precedes its body and it is handy to write it down first
|
||||
// and then fill the length. With Write we would have to either calc it
|
||||
// manually or have one more buffer.
|
||||
pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> {
|
||||
/// Serialize `message` to the given `buf`.
|
||||
/// Apart from smart memory managemet, BytesMut is good here as msg len
|
||||
/// precedes its body and it is handy to write it down first and then fill
|
||||
/// the length. With Write we would have to either calc it manually or have
|
||||
/// one more buffer.
|
||||
pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> {
|
||||
match message {
|
||||
BeMessage::AuthenticationOk => {
|
||||
buf.put_u8(b'R');
|
||||
@@ -738,7 +727,7 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_slice(extra);
|
||||
}
|
||||
}
|
||||
Ok::<_, io::Error>(())
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
|
||||
@@ -842,7 +831,7 @@ impl<'a> BeMessage<'a> {
|
||||
write_cstr(error_msg, buf)?;
|
||||
|
||||
buf.put_u8(0); // terminator
|
||||
Ok::<_, io::Error>(())
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
|
||||
@@ -865,7 +854,7 @@ impl<'a> BeMessage<'a> {
|
||||
write_cstr(error_msg.as_bytes(), buf)?;
|
||||
|
||||
buf.put_u8(0); // terminator
|
||||
Ok::<_, io::Error>(())
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
|
||||
@@ -920,7 +909,7 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_i32(-1); /* typmod */
|
||||
buf.put_i16(0); /* format code */
|
||||
}
|
||||
Ok::<_, io::Error>(())
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
|
||||
@@ -987,7 +976,7 @@ impl ReplicationFeedback {
|
||||
// null-terminated string - key,
|
||||
// uint32 - value length in bytes
|
||||
// value itself
|
||||
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
|
||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
@@ -1012,7 +1001,6 @@ impl ReplicationFeedback {
|
||||
buf.put_slice(b"ps_replytime\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_i64(timestamp);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Deserialize ReplicationFeedback message
|
||||
@@ -1080,7 +1068,7 @@ mod tests {
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data).unwrap();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
let rf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
@@ -1095,7 +1083,7 @@ mod tests {
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data).unwrap();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
if let Some(first) = data.first_mut() {
|
||||
@@ -1137,15 +1125,6 @@ mod tests {
|
||||
let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
|
||||
assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]);
|
||||
}
|
||||
|
||||
// Make sure that `read` is sync/async callable
|
||||
async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) {
|
||||
let _ = FeMessage::read(&mut [].as_ref());
|
||||
let _ = FeMessage::read_fut(stream).await;
|
||||
|
||||
let _ = FeStartupPacket::read(&mut [].as_ref());
|
||||
let _ = FeStartupPacket::read_fut(stream).await;
|
||||
}
|
||||
}
|
||||
|
||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||
|
||||
@@ -1,179 +0,0 @@
|
||||
use pin_project_lite::pin_project;
|
||||
use std::future::Future;
|
||||
use std::marker::PhantomData;
|
||||
use std::pin::Pin;
|
||||
use std::{io, task};
|
||||
|
||||
pin_project! {
|
||||
/// We use this future to mark certain methods
|
||||
/// as callable in both sync and async modes.
|
||||
#[repr(transparent)]
|
||||
pub struct SyncFuture<S, T: Future> {
|
||||
#[pin]
|
||||
inner: T,
|
||||
_marker: PhantomData<S>,
|
||||
}
|
||||
}
|
||||
|
||||
/// This wrapper lets us synchronously wait for inner future's completion
|
||||
/// (see [`SyncFuture::wait`]) **provided that `S` implements [`SyncProof`]**.
|
||||
/// For instance, `S` may be substituted with types implementing
|
||||
/// [`tokio::io::AsyncRead`], but it's not the only viable option.
|
||||
impl<S, T: Future> SyncFuture<S, T> {
|
||||
/// NOTE: caller should carefully pick a type for `S`,
|
||||
/// because we don't want to enable [`SyncFuture::wait`] when
|
||||
/// it's in fact impossible to run the future synchronously.
|
||||
/// Violation of this contract will not cause UB, but
|
||||
/// panics and async event loop freezes won't please you.
|
||||
///
|
||||
/// Example:
|
||||
///
|
||||
/// ```
|
||||
/// # use pq_proto::sync::SyncFuture;
|
||||
/// # use std::future::Future;
|
||||
/// # use tokio::io::AsyncReadExt;
|
||||
/// #
|
||||
/// // Parse a pair of numbers from a stream
|
||||
/// pub fn parse_pair<Reader>(
|
||||
/// stream: &mut Reader,
|
||||
/// ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<(u32, u64)>> + '_>
|
||||
/// where
|
||||
/// Reader: tokio::io::AsyncRead + Unpin,
|
||||
/// {
|
||||
/// // If `Reader` is a `SyncProof`, this will give caller
|
||||
/// // an opportunity to use `SyncFuture::wait`, because
|
||||
/// // `.await` will always result in `Poll::Ready`.
|
||||
/// SyncFuture::new(async move {
|
||||
/// let x = stream.read_u32().await?;
|
||||
/// let y = stream.read_u64().await?;
|
||||
/// Ok((x, y))
|
||||
/// })
|
||||
/// }
|
||||
/// ```
|
||||
pub fn new(inner: T) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
_marker: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S, T: Future> Future for SyncFuture<S, T> {
|
||||
type Output = T::Output;
|
||||
|
||||
/// In async code, [`SyncFuture`] behaves like a regular wrapper.
|
||||
#[inline(always)]
|
||||
fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll<Self::Output> {
|
||||
self.project().inner.poll(cx)
|
||||
}
|
||||
}
|
||||
|
||||
/// Postulates that we can call [`SyncFuture::wait`].
|
||||
/// If implementer is also a [`Future`], it should always
|
||||
/// return [`task::Poll::Ready`] from [`Future::poll`].
|
||||
///
|
||||
/// Each implementation should document which futures
|
||||
/// specifically are being declared sync-proof.
|
||||
pub trait SyncPostulate {}
|
||||
|
||||
impl<T: SyncPostulate> SyncPostulate for &T {}
|
||||
impl<T: SyncPostulate> SyncPostulate for &mut T {}
|
||||
|
||||
impl<P: SyncPostulate, T: Future> SyncFuture<P, T> {
|
||||
/// Synchronously wait for future completion.
|
||||
pub fn wait(mut self) -> T::Output {
|
||||
const RAW_WAKER: task::RawWaker = task::RawWaker::new(
|
||||
std::ptr::null(),
|
||||
&task::RawWakerVTable::new(
|
||||
|_| RAW_WAKER,
|
||||
|_| panic!("SyncFuture: failed to wake"),
|
||||
|_| panic!("SyncFuture: failed to wake by ref"),
|
||||
|_| { /* drop is no-op */ },
|
||||
),
|
||||
);
|
||||
|
||||
// SAFETY: We never move `self` during this call;
|
||||
// furthermore, it will be dropped in the end regardless of panics
|
||||
let this = unsafe { Pin::new_unchecked(&mut self) };
|
||||
|
||||
// SAFETY: This waker doesn't do anything apart from panicking
|
||||
let waker = unsafe { task::Waker::from_raw(RAW_WAKER) };
|
||||
let context = &mut task::Context::from_waker(&waker);
|
||||
|
||||
match this.poll(context) {
|
||||
task::Poll::Ready(res) => res,
|
||||
_ => panic!("SyncFuture: unexpected pending!"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This wrapper turns any [`std::io::Read`] into a blocking [`tokio::io::AsyncRead`],
|
||||
/// which lets us abstract over sync & async readers in methods returning [`SyncFuture`].
|
||||
/// NOTE: you **should not** use this in async code.
|
||||
#[repr(transparent)]
|
||||
pub struct AsyncishRead<T: io::Read + Unpin>(pub T);
|
||||
|
||||
/// This lets us call [`SyncFuture<AsyncishRead<_>, _>::wait`],
|
||||
/// and allows the future to await on any of the [`AsyncRead`]
|
||||
/// and [`AsyncReadExt`] methods on `AsyncishRead`.
|
||||
impl<T: io::Read + Unpin> SyncPostulate for AsyncishRead<T> {}
|
||||
|
||||
impl<T: io::Read + Unpin> tokio::io::AsyncRead for AsyncishRead<T> {
|
||||
#[inline(always)]
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
_cx: &mut task::Context<'_>,
|
||||
buf: &mut tokio::io::ReadBuf<'_>,
|
||||
) -> task::Poll<io::Result<()>> {
|
||||
task::Poll::Ready(
|
||||
// `Read::read` will block, meaning we don't need a real event loop!
|
||||
self.0
|
||||
.read(buf.initialize_unfilled())
|
||||
.map(|sz| buf.advance(sz)),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
|
||||
// async helper(stream: &mut impl AsyncRead) -> io::Result<u32>
|
||||
fn bytes_add<Reader>(
|
||||
stream: &mut Reader,
|
||||
) -> SyncFuture<Reader, impl Future<Output = io::Result<u32>> + '_>
|
||||
where
|
||||
Reader: tokio::io::AsyncRead + Unpin,
|
||||
{
|
||||
SyncFuture::new(async move {
|
||||
let a = stream.read_u32().await?;
|
||||
let b = stream.read_u32().await?;
|
||||
Ok(a + b)
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sync() {
|
||||
let bytes = [100u32.to_be_bytes(), 200u32.to_be_bytes()].concat();
|
||||
let res = bytes_add(&mut AsyncishRead(&mut &bytes[..]))
|
||||
.wait()
|
||||
.unwrap();
|
||||
assert_eq!(res, 300);
|
||||
}
|
||||
|
||||
// We need a single-threaded executor for this test
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
async fn test_async() {
|
||||
let (mut tx, mut rx) = tokio::net::UnixStream::pair().unwrap();
|
||||
|
||||
let write = async move {
|
||||
tx.write_u32(100).await?;
|
||||
tx.write_u32(200).await?;
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let (res, ()) = tokio::try_join!(bytes_add(&mut rx), write).unwrap();
|
||||
assert_eq!(res, 300);
|
||||
}
|
||||
}
|
||||
@@ -21,7 +21,7 @@ toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
pin-project-lite.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -111,7 +111,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
}
|
||||
|
||||
pub struct Download {
|
||||
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
|
||||
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
|
||||
@@ -20,7 +20,10 @@ use aws_sdk_s3::{
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::Body;
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio::{
|
||||
io::{self, AsyncRead},
|
||||
sync::Semaphore,
|
||||
};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
@@ -102,7 +105,7 @@ pub struct S3Bucket {
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
concurrency_limiter: Semaphore,
|
||||
concurrency_limiter: Arc<Semaphore>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -162,7 +165,7 @@ impl S3Bucket {
|
||||
client,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -194,9 +197,10 @@ impl S3Bucket {
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let _guard = self
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")
|
||||
.map_err(DownloadError::Other)?;
|
||||
@@ -217,9 +221,10 @@ impl S3Bucket {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(io::BufReader::new(
|
||||
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
|
||||
permit,
|
||||
object_output.body.into_async_read(),
|
||||
)),
|
||||
))),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError {
|
||||
@@ -240,6 +245,32 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
||||
struct RatelimitedAsyncRead<S> {
|
||||
permit: tokio::sync::OwnedSemaphorePermit,
|
||||
#[pin]
|
||||
inner: S,
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
|
||||
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
|
||||
RatelimitedAsyncRead { permit, inner }
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
let this = self.project();
|
||||
this.inner.poll_read(cx, buf)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
|
||||
@@ -7,5 +7,7 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
219
libs/tenant_size_model/src/calculation.rs
Normal file
219
libs/tenant_size_model/src/calculation.rs
Normal file
@@ -0,0 +1,219 @@
|
||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
#[derive(Clone, Debug)]
|
||||
struct SegmentSize {
|
||||
method: SegmentMethod,
|
||||
|
||||
// calculated size of this subtree, using this method
|
||||
accum_size: u64,
|
||||
|
||||
seg_id: usize,
|
||||
children: Vec<SegmentSize>,
|
||||
}
|
||||
|
||||
struct SizeAlternatives {
|
||||
// cheapest alternative if parent is available.
|
||||
incremental: SegmentSize,
|
||||
|
||||
// cheapest alternative if parent node is not available
|
||||
non_incremental: Option<SegmentSize>,
|
||||
}
|
||||
|
||||
impl StorageModel {
|
||||
pub fn calculate(&self) -> SizeResult {
|
||||
// Build adjacency list. 'child_list' is indexed by segment id. Each entry
|
||||
// contains a list of all child segments of the segment.
|
||||
let mut roots: Vec<usize> = Vec::new();
|
||||
let mut child_list: Vec<Vec<usize>> = Vec::new();
|
||||
child_list.resize(self.segments.len(), Vec::new());
|
||||
|
||||
for (seg_id, seg) in self.segments.iter().enumerate() {
|
||||
if let Some(parent_id) = seg.parent {
|
||||
child_list[parent_id].push(seg_id);
|
||||
} else {
|
||||
roots.push(seg_id);
|
||||
}
|
||||
}
|
||||
|
||||
let mut segment_results = Vec::new();
|
||||
segment_results.resize(
|
||||
self.segments.len(),
|
||||
SegmentSizeResult {
|
||||
method: SegmentMethod::Skipped,
|
||||
accum_size: 0,
|
||||
},
|
||||
);
|
||||
|
||||
let mut total_size = 0;
|
||||
for root in roots {
|
||||
if let Some(selected) = self.size_here(root, &child_list).non_incremental {
|
||||
StorageModel::fill_selected_sizes(&selected, &mut segment_results);
|
||||
total_size += selected.accum_size;
|
||||
} else {
|
||||
// Couldn't find any way to get this root. Error?
|
||||
}
|
||||
}
|
||||
|
||||
SizeResult {
|
||||
total_size,
|
||||
segments: segment_results,
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec<SegmentSizeResult>) {
|
||||
result[selected.seg_id] = SegmentSizeResult {
|
||||
method: selected.method,
|
||||
accum_size: selected.accum_size,
|
||||
};
|
||||
// recurse to children
|
||||
for child in selected.children.iter() {
|
||||
StorageModel::fill_selected_sizes(child, result);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// This is the core of the sizing calculation.
|
||||
//
|
||||
// This is a recursive function, that for each Segment calculates the best way
|
||||
// to reach all the Segments that are marked as needed in this subtree, under two
|
||||
// different conditions:
|
||||
// a) when the parent of this segment is available (as a snaphot or through WAL), and
|
||||
// b) when the parent of this segment is not available.
|
||||
//
|
||||
fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives {
|
||||
let seg = &self.segments[seg_id];
|
||||
// First figure out the best way to get each child
|
||||
let mut children = Vec::new();
|
||||
for child_id in &child_list[seg_id] {
|
||||
children.push(self.size_here(*child_id, child_list))
|
||||
}
|
||||
|
||||
// Method 1. If this node is not needed, we can skip it as long as we
|
||||
// take snapshots later in each sub-tree
|
||||
let snapshot_later = if !seg.needed {
|
||||
let mut snapshot_later = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::Skipped,
|
||||
accum_size: 0,
|
||||
children: Vec::new(),
|
||||
};
|
||||
|
||||
let mut possible = true;
|
||||
for child in children.iter() {
|
||||
if let Some(non_incremental) = &child.non_incremental {
|
||||
snapshot_later.accum_size += non_incremental.accum_size;
|
||||
snapshot_later.children.push(non_incremental.clone())
|
||||
} else {
|
||||
possible = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if possible {
|
||||
Some(snapshot_later)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of
|
||||
// this Segment was given.
|
||||
let snapshot_here = if !seg.needed || seg.parent.is_none() {
|
||||
if let Some(snapshot_size) = seg.size {
|
||||
let mut snapshot_here = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::SnapshotHere,
|
||||
accum_size: snapshot_size,
|
||||
children: Vec::new(),
|
||||
};
|
||||
for child in children.iter() {
|
||||
snapshot_here.accum_size += child.incremental.accum_size;
|
||||
snapshot_here.children.push(child.incremental.clone())
|
||||
}
|
||||
Some(snapshot_here)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Method 3. Use WAL to get here from parent
|
||||
let wal_here = {
|
||||
let mut wal_here = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::Wal,
|
||||
accum_size: if let Some(parent_id) = seg.parent {
|
||||
seg.lsn - self.segments[parent_id].lsn
|
||||
} else {
|
||||
0
|
||||
},
|
||||
children: Vec::new(),
|
||||
};
|
||||
for child in children {
|
||||
wal_here.accum_size += child.incremental.accum_size;
|
||||
wal_here.children.push(child.incremental)
|
||||
}
|
||||
wal_here
|
||||
};
|
||||
|
||||
// If the parent is not available, what's the cheapest method involving
|
||||
// a snapshot here or later?
|
||||
let mut cheapest_non_incremental: Option<SegmentSize> = None;
|
||||
if let Some(snapshot_here) = snapshot_here {
|
||||
cheapest_non_incremental = Some(snapshot_here);
|
||||
}
|
||||
if let Some(snapshot_later) = snapshot_later {
|
||||
// Use <=, to prefer skipping if the size is equal
|
||||
if let Some(parent) = &cheapest_non_incremental {
|
||||
if snapshot_later.accum_size <= parent.accum_size {
|
||||
cheapest_non_incremental = Some(snapshot_later);
|
||||
}
|
||||
} else {
|
||||
cheapest_non_incremental = Some(snapshot_later);
|
||||
}
|
||||
}
|
||||
|
||||
// And what's the cheapest method, if the parent is available?
|
||||
let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental
|
||||
{
|
||||
// Is it cheaper to use a snapshot here or later, anyway?
|
||||
// Use <, to prefer Wal over snapshot if the cost is the same
|
||||
if wal_here.accum_size < cheapest_non_incremental.accum_size {
|
||||
wal_here
|
||||
} else {
|
||||
cheapest_non_incremental.clone()
|
||||
}
|
||||
} else {
|
||||
wal_here
|
||||
};
|
||||
|
||||
SizeAlternatives {
|
||||
incremental: cheapest_incremental,
|
||||
non_incremental: cheapest_non_incremental,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,401 +1,70 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
//! Synthetic size calculation
|
||||
|
||||
use anyhow::Context;
|
||||
mod calculation;
|
||||
pub mod svg;
|
||||
|
||||
/// Pricing model or history size builder.
|
||||
/// StorageModel is the input to the synthetic size calculation. It represents
|
||||
/// a tree of timelines, with just the information that's needed for the
|
||||
/// calculation. This doesn't track timeline names or where each timeline
|
||||
/// begins and ends, for example. Instead, it consists of "points of interest"
|
||||
/// on the timelines. A point of interest could be the timeline start or end point,
|
||||
/// the oldest point on a timeline that needs to be retained because of PITR
|
||||
/// cutoff, or snapshot points named by the user. For each such point, and the
|
||||
/// edge connecting the points (implicit in Segment), we store information about
|
||||
/// whether we need to be able to recover to the point, and if known, the logical
|
||||
/// size at the point.
|
||||
///
|
||||
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||
/// type.
|
||||
pub struct Storage<K: 'static> {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of a segment describing it's latest state.
|
||||
branches: HashMap<K, usize>,
|
||||
/// The segments must form a well-formed tree, with no loops.
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct StorageModel {
|
||||
pub segments: Vec<Segment>,
|
||||
}
|
||||
|
||||
/// Snapshot of a branch.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
/// Segment represents one point in the tree of branches, *and* the edge that leads
|
||||
/// to it (if any). We don't need separate structs for points and edges, because each
|
||||
/// point can have only one parent.
|
||||
///
|
||||
/// When 'needed' is true, it means that we need to be able to reconstruct
|
||||
/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only
|
||||
/// a single point is needed, create two Segments with the same lsn, and mark only
|
||||
/// the child as needed.
|
||||
///
|
||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct Segment {
|
||||
/// Previous segment index into ['Storage::segments`], if any.
|
||||
parent: Option<usize>,
|
||||
pub parent: Option<usize>,
|
||||
|
||||
/// Description of how did we get to this state.
|
||||
///
|
||||
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
|
||||
/// modifying a branch directly.
|
||||
pub op: Cow<'static, str>,
|
||||
/// LSN at this point
|
||||
pub lsn: u64,
|
||||
|
||||
/// LSN before this state
|
||||
start_lsn: u64,
|
||||
/// Logical size at this node, if known.
|
||||
pub size: Option<u64>,
|
||||
|
||||
/// LSN at this state
|
||||
pub end_lsn: u64,
|
||||
|
||||
/// Logical size before this state
|
||||
start_size: u64,
|
||||
|
||||
/// Logical size at this state. Can be None in the last Segment of a branch.
|
||||
pub end_size: Option<u64>,
|
||||
|
||||
/// Indices to [`Storage::segments`]
|
||||
///
|
||||
/// FIXME: this could be an Option<usize>
|
||||
children_after: Vec<usize>,
|
||||
|
||||
/// Determined by `retention_period` given to [`Storage::calculate`]
|
||||
/// If true, the segment from parent to this node is needed by `retention_period`
|
||||
pub needed: bool,
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
/// Result of synthetic size calculation. Returned by StorageModel::calculate()
|
||||
pub struct SizeResult {
|
||||
pub total_size: u64,
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
pub struct SegmentSize {
|
||||
pub seg_id: usize,
|
||||
|
||||
pub method: SegmentMethod,
|
||||
|
||||
this_size: u64,
|
||||
|
||||
pub children: Vec<SegmentSize>,
|
||||
// This has same length as the StorageModel::segments vector in the input.
|
||||
// Each entry in this array corresponds to the entry with same index in
|
||||
// StorageModel::segments.
|
||||
pub segments: Vec<SegmentSizeResult>,
|
||||
}
|
||||
|
||||
impl SegmentSize {
|
||||
fn total(&self) -> u64 {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
|
||||
pub fn total_children(&self) -> u64 {
|
||||
if self.method == SnapshotAfter {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
} else {
|
||||
self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct SegmentSizeResult {
|
||||
pub method: SegmentMethod,
|
||||
// calculated size of this subtree, using this method
|
||||
pub accum_size: u64,
|
||||
}
|
||||
|
||||
/// Different methods to retain history from a particular state
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum SegmentMethod {
|
||||
SnapshotAfter,
|
||||
Wal,
|
||||
WalNeeded,
|
||||
SnapshotHere, // A logical snapshot is needed after this segment
|
||||
Wal, // Keep WAL leading up to this node
|
||||
Skipped,
|
||||
}
|
||||
|
||||
use SegmentMethod::*;
|
||||
|
||||
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: K) -> Storage<K> {
|
||||
let init_segment = Segment {
|
||||
op: "".into(),
|
||||
needed: false,
|
||||
parent: None,
|
||||
start_lsn: 0,
|
||||
end_lsn: 0,
|
||||
start_size: 0,
|
||||
end_size: Some(0),
|
||||
children_after: Vec::new(),
|
||||
};
|
||||
|
||||
Storage {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch, 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with a new point, at given LSN.
|
||||
pub fn insert_point<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn: u64,
|
||||
size: Option<u64>,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
assert!(lsn > lastseg.end_lsn);
|
||||
|
||||
let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lsn,
|
||||
start_size,
|
||||
end_size: size,
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn_bytes: u64,
|
||||
size_bytes: i64,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||
start_size: last_end_size,
|
||||
end_size: Some((last_end_size as i64 + size_bytes) as u64),
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
|
||||
}
|
||||
|
||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "update".into(), bytes, 0i64)
|
||||
}
|
||||
|
||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
|
||||
}
|
||||
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q> + std::fmt::Debug,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
// Find the right segment
|
||||
let branchseg_id = *self.branches.get(parent).with_context(|| {
|
||||
format!(
|
||||
"should had found the parent {:?} by key. in branches {:?}",
|
||||
parent, self.branches
|
||||
)
|
||||
})?;
|
||||
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name, branchseg_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.end_lsn < cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: For each oldest segment in a chain that needs to be retained,
|
||||
// calculate if we should store snapshot or WAL
|
||||
self.size_from_snapshot_later(0)
|
||||
}
|
||||
|
||||
fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
let seg = &self.segments[seg_id];
|
||||
|
||||
let this_size = seg.end_lsn - seg.start_lsn;
|
||||
|
||||
let mut children = Vec::new();
|
||||
|
||||
// try both ways
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: if seg.needed { WalNeeded } else { Wal },
|
||||
this_size,
|
||||
children,
|
||||
})
|
||||
}
|
||||
|
||||
fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
// If this is needed, then it's time to do the snapshot and continue
|
||||
// with wal method.
|
||||
let seg = &self.segments[seg_id];
|
||||
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
|
||||
if seg.needed {
|
||||
let mut children = Vec::new();
|
||||
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: WalNeeded,
|
||||
this_size: seg.start_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
// If any of the direct children are "needed", need to be able to reconstruct here
|
||||
let mut children_needed = false;
|
||||
for &child in seg.children_after.iter() {
|
||||
let seg = &self.segments[child];
|
||||
if seg.needed {
|
||||
children_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let method1 = if !children_needed {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_snapshot_later(*child)?);
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: Skipped,
|
||||
this_size: 0,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// If this a junction, consider snapshotting here
|
||||
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_wal(*child)?);
|
||||
}
|
||||
let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: SnapshotAfter,
|
||||
this_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(match (method1, method2) {
|
||||
(None, None) => anyhow::bail!(
|
||||
"neither method was applicable: children_after={}, children_needed={}",
|
||||
seg.children_after.len(),
|
||||
children_needed
|
||||
),
|
||||
(Some(method), None) => method,
|
||||
(None, Some(method)) => method,
|
||||
(Some(method1), Some(method2)) => {
|
||||
if method1.total() < method2.total() {
|
||||
method1
|
||||
} else {
|
||||
method2
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_segments(self) -> Vec<Segment> {
|
||||
self.segments
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,269 +0,0 @@
|
||||
//! Tenant size model testing ground.
|
||||
//!
|
||||
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
|
||||
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
|
||||
//! into pngs.
|
||||
|
||||
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
storage.update("main", 1_000)?;
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
let mut storage = Storage::new("a");
|
||||
storage.insert("a", 5000)?;
|
||||
storage.branch("a", "b")?;
|
||||
storage.update("b", 4000)?;
|
||||
storage.update("a", 2000)?;
|
||||
storage.branch("a", "c")?;
|
||||
storage.insert("c", 4000)?;
|
||||
storage.insert("a", 2000)?;
|
||||
|
||||
let size = storage.calculate(5000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
use std::borrow::Cow;
|
||||
|
||||
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||
|
||||
let branches = [
|
||||
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
|
||||
Some(0xb1719e044db05401a05a2ed588a3ad3f),
|
||||
Some(0xb68d6691c895ad0a70809470020929ef),
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut storage = Storage::new(None);
|
||||
|
||||
storage.branch(&None, branches[0])?; // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
|
||||
storage.branch(&branches[0], branches[1])?; // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
|
||||
storage.branch(&branches[0], branches[2])?; // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
|
||||
|
||||
let size = storage.calculate(100_000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
|
||||
let scenario = if args.len() < 2 { "1" } else { &args[1] };
|
||||
|
||||
let (segments, size) = match scenario {
|
||||
"1" => scenario_1(),
|
||||
"2" => scenario_2(),
|
||||
"3" => scenario_3(),
|
||||
"4" => scenario_4(),
|
||||
"5" => scenario_5(),
|
||||
"6" => scenario_6(),
|
||||
other => {
|
||||
eprintln!("invalid scenario {}", other);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
.unwrap();
|
||||
|
||||
graphviz_tree(&segments, &size);
|
||||
}
|
||||
|
||||
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
|
||||
use tenant_size_model::SegmentMethod::*;
|
||||
|
||||
let seg_id = node.seg_id;
|
||||
let seg = segments.get(seg_id).unwrap();
|
||||
let lsn = seg.end_lsn;
|
||||
let size = seg.end_size.unwrap_or(0);
|
||||
let method = node.method;
|
||||
|
||||
println!(" {{");
|
||||
println!(" node [width=0.1 height=0.1 shape=oval]");
|
||||
|
||||
let tenant_size = node.total_children();
|
||||
|
||||
let penwidth = if seg.needed { 6 } else { 3 };
|
||||
let x = match method {
|
||||
SnapshotAfter =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
|
||||
Wal =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
WalNeeded =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
Skipped =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
|
||||
};
|
||||
|
||||
println!(" \"seg{seg_id}\" [{x}]");
|
||||
println!(" }}");
|
||||
|
||||
// Recurse. Much of the data is actually on the edge
|
||||
for child in node.children.iter() {
|
||||
let child_id = child.seg_id;
|
||||
graphviz_recurse(segments, child);
|
||||
|
||||
let edge_color = match child.method {
|
||||
SnapshotAfter => "gray",
|
||||
Wal => "black",
|
||||
WalNeeded => "black",
|
||||
Skipped => "gray",
|
||||
};
|
||||
|
||||
println!(" {{");
|
||||
println!(" edge [] ");
|
||||
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
|
||||
print!("color={edge_color}");
|
||||
if child.method == WalNeeded {
|
||||
print!(" penwidth=6");
|
||||
}
|
||||
if child.method == Wal {
|
||||
print!(" penwidth=3");
|
||||
}
|
||||
|
||||
let next = segments.get(child_id).unwrap();
|
||||
|
||||
if next.op.is_empty() {
|
||||
print!(
|
||||
" label=\"{} / {}\"",
|
||||
next.end_lsn - seg.end_lsn,
|
||||
(next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
|
||||
);
|
||||
} else {
|
||||
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
|
||||
}
|
||||
println!("]");
|
||||
println!(" }}");
|
||||
}
|
||||
}
|
||||
|
||||
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
||||
println!("digraph G {{");
|
||||
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
|
||||
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" graph [center=1 rankdir=LR]");
|
||||
println!(" edge [dir=none]");
|
||||
|
||||
graphviz_recurse(segments, tree);
|
||||
|
||||
println!("}}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenarios_return_same_size() {
|
||||
type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
|
||||
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||
(line!(), scenario_1, 8000),
|
||||
(line!(), scenario_2, 9000),
|
||||
(line!(), scenario_3, 13000),
|
||||
(line!(), scenario_4, 16000),
|
||||
(line!(), scenario_5, 17000),
|
||||
(line!(), scenario_6, 333_792_000),
|
||||
];
|
||||
|
||||
for (line, scenario, expected) in truths {
|
||||
let (_, size) = scenario().unwrap();
|
||||
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||
}
|
||||
}
|
||||
193
libs/tenant_size_model/src/svg.rs
Normal file
193
libs/tenant_size_model/src/svg.rs
Normal file
@@ -0,0 +1,193 @@
|
||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
use std::fmt::Write;
|
||||
|
||||
const SVG_WIDTH: f32 = 500.0;
|
||||
|
||||
struct SvgDraw<'a> {
|
||||
storage: &'a StorageModel,
|
||||
branches: &'a [String],
|
||||
seg_to_branch: &'a [usize],
|
||||
sizes: &'a [SegmentSizeResult],
|
||||
|
||||
// layout
|
||||
xscale: f32,
|
||||
min_lsn: u64,
|
||||
seg_coordinates: Vec<(f32, f32)>,
|
||||
}
|
||||
|
||||
fn draw_legend(result: &mut String) -> anyhow::Result<()> {
|
||||
writeln!(
|
||||
result,
|
||||
"<circle cx=\"10\" cy=\"10\" r=\"5\" stroke=\"red\"/>"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"15\">logical snapshot</text>")?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"30\" x2=\"15\" y2=\"30\" stroke-width=\"6\" stroke=\"black\" />"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<text x=\"20\" y=\"35\">WAL within retention period</text>"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"50\" x2=\"15\" y2=\"50\" stroke-width=\"3\" stroke=\"black\" />"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<text x=\"20\" y=\"55\">WAL retained to avoid copy</text>"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn draw_svg(
|
||||
storage: &StorageModel,
|
||||
branches: &[String],
|
||||
seg_to_branch: &[usize],
|
||||
sizes: &SizeResult,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut draw = SvgDraw {
|
||||
storage,
|
||||
branches,
|
||||
seg_to_branch,
|
||||
sizes: &sizes.segments,
|
||||
|
||||
xscale: 0.0,
|
||||
min_lsn: 0,
|
||||
seg_coordinates: Vec::new(),
|
||||
};
|
||||
|
||||
let mut result = String::new();
|
||||
|
||||
writeln!(result, "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" height=\"300\" width=\"500\">")?;
|
||||
|
||||
draw.calculate_svg_layout();
|
||||
|
||||
// Draw the tree
|
||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
||||
draw.draw_seg_phase1(seg_id, &mut result)?;
|
||||
}
|
||||
|
||||
// Draw snapshots
|
||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
||||
draw.draw_seg_phase2(seg_id, &mut result)?;
|
||||
}
|
||||
|
||||
draw_legend(&mut result)?;
|
||||
|
||||
write!(result, "</svg>")?;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
impl<'a> SvgDraw<'a> {
|
||||
fn calculate_svg_layout(&mut self) {
|
||||
// Find x scale
|
||||
let segments = &self.storage.segments;
|
||||
let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min);
|
||||
let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max);
|
||||
|
||||
// Start with 1 pixel = 1 byte. Double the scale until it fits into the image
|
||||
let mut xscale = 1.0;
|
||||
while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH {
|
||||
xscale *= 2.0;
|
||||
}
|
||||
|
||||
// Layout the timelines on Y dimension.
|
||||
// TODO
|
||||
let mut y = 100.0;
|
||||
let mut branch_y_coordinates = Vec::new();
|
||||
for _branch in self.branches {
|
||||
branch_y_coordinates.push(y);
|
||||
y += 40.0;
|
||||
}
|
||||
|
||||
// Calculate coordinates for each point
|
||||
let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
|
||||
.map(|(seg, branch_id)| {
|
||||
let x = (seg.lsn - min_lsn) as f32 / xscale;
|
||||
let y = branch_y_coordinates[*branch_id];
|
||||
(x, y)
|
||||
})
|
||||
.collect();
|
||||
|
||||
self.xscale = xscale;
|
||||
self.min_lsn = min_lsn;
|
||||
self.seg_coordinates = seg_coordinates;
|
||||
}
|
||||
|
||||
/// Draws lines between points
|
||||
fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
||||
let seg = &self.storage.segments[seg_id];
|
||||
|
||||
let wal_bytes = if let Some(parent_id) = seg.parent {
|
||||
seg.lsn - self.storage.segments[parent_id].lsn
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let style = match self.sizes[seg_id].method {
|
||||
SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
|
||||
SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
|
||||
"stroke-width=\"6\" stroke=\"black\""
|
||||
}
|
||||
SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
|
||||
SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
|
||||
};
|
||||
if let Some(parent_id) = seg.parent {
|
||||
let (x1, y1) = self.seg_coordinates[parent_id];
|
||||
let (x2, y2) = self.seg_coordinates[seg_id];
|
||||
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
" <title>{wal_bytes} bytes of WAL (seg {seg_id})</title>"
|
||||
)?;
|
||||
writeln!(result, "</line>")?;
|
||||
} else {
|
||||
// draw a little dash to mark the starting point of this branch
|
||||
let (x, y) = self.seg_coordinates[seg_id];
|
||||
let (x1, y1) = (x, y - 5.0);
|
||||
let (x2, y2) = (x, y + 5.0);
|
||||
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
||||
)?;
|
||||
writeln!(result, " <title>(seg {seg_id})</title>")?;
|
||||
writeln!(result, "</line>")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Draw circles where snapshots are taken
|
||||
fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
||||
let seg = &self.storage.segments[seg_id];
|
||||
|
||||
// draw a snapshot point if it's needed
|
||||
let (coord_x, coord_y) = self.seg_coordinates[seg_id];
|
||||
if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
|
||||
writeln!(
|
||||
result,
|
||||
"<circle cx=\"{coord_x}\" cy=\"{coord_y}\" r=\"5\" stroke=\"red\">",
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
" <title>logical size {}</title>",
|
||||
seg.size.unwrap()
|
||||
)?;
|
||||
write!(result, "</circle>")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
313
libs/tenant_size_model/tests/tests.rs
Normal file
313
libs/tenant_size_model/tests/tests.rs
Normal file
@@ -0,0 +1,313 @@
|
||||
//! Tenant size model tests.
|
||||
|
||||
use tenant_size_model::{Segment, SizeResult, StorageModel};
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
struct ScenarioBuilder {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of a segment describing its latest state.
|
||||
branches: HashMap<String, usize>,
|
||||
}
|
||||
|
||||
impl ScenarioBuilder {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: &str) -> ScenarioBuilder {
|
||||
let init_segment = Segment {
|
||||
parent: None,
|
||||
lsn: 0,
|
||||
size: Some(0),
|
||||
needed: false, // determined later
|
||||
};
|
||||
|
||||
ScenarioBuilder {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch.into(), 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) {
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let newseg = Segment {
|
||||
parent: Some(lastseg_id),
|
||||
lsn: lastseg.lsn + lsn_bytes,
|
||||
size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
|
||||
needed: false,
|
||||
};
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, bytes as i64);
|
||||
}
|
||||
|
||||
pub fn update(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, 0i64);
|
||||
}
|
||||
|
||||
pub fn _delete(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, -(bytes as i64));
|
||||
}
|
||||
|
||||
/// Panics if the parent branch cannot be found.
|
||||
pub fn branch(&mut self, parent: &str, name: &str) {
|
||||
// Find the right segment
|
||||
let branchseg_id = *self
|
||||
.branches
|
||||
.get(parent)
|
||||
.expect("should had found the parent by key");
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name.to_string(), branchseg_id);
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.lsn <= cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Perform the calculation
|
||||
let storage_model = StorageModel {
|
||||
segments: self.segments.clone(),
|
||||
};
|
||||
let size_result = storage_model.calculate();
|
||||
(storage_model, size_result)
|
||||
}
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
#[test]
|
||||
fn scenario_1() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Calculate the synthetic size with retention horizon 1000
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
// The end of the branch is at LSN 10000. Need to retain
|
||||
// a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
|
||||
// The logical snapshot has size 5000.
|
||||
assert_eq!(result.total_size, 5000 + 1000);
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
#[test]
|
||||
fn scenario_2() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
scenario.update("main", 1_000);
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 11000
|
||||
// *----*----*--------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, we need to retain logical snapshot
|
||||
// at the branch point, size 5000, and the WAL from 10000-11000 on
|
||||
// both branches.
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 1000);
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
#[test]
|
||||
fn scenario_3() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 15000
|
||||
// *----*----*------------------------------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, it's still cheapest to retain
|
||||
// - snapshot at branch point (size 5000)
|
||||
// - WAL on child between 10000-11000
|
||||
// - WAL on main between 10000-15000
|
||||
//
|
||||
// This is in total 5000 + 1000 + 5000
|
||||
//
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 5000);
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
#[test]
|
||||
fn scenario_4() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 18000
|
||||
// *----*----*------------------------------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, it's now cheapest to retain
|
||||
// separate snapshots on both branches:
|
||||
// - snapshot on main branch at LSN 17000 (size 5000)
|
||||
// - WAL on main between 17000-18000
|
||||
// - snapshot on child branch at LSN 10000 (size 5000)
|
||||
// - WAL on child between 10000-11000
|
||||
//
|
||||
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
||||
//
|
||||
// (If we used the the method from the previous scenario, and
|
||||
// kept only snapshot at the branch point, we'd need to keep
|
||||
// all the WAL between 10000-18000 on the main branch, so
|
||||
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
||||
// calculation always picks the cheapest alternative)
|
||||
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenario_5() {
|
||||
let mut scenario = ScenarioBuilder::new("a");
|
||||
scenario.insert("a", 5000);
|
||||
scenario.branch("a", "b");
|
||||
scenario.update("b", 4000);
|
||||
scenario.update("a", 2000);
|
||||
scenario.branch("a", "c");
|
||||
scenario.insert("c", 4000);
|
||||
scenario.insert("a", 2000);
|
||||
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 17000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenario_6() {
|
||||
let branches = [
|
||||
"7ff1edab8182025f15ae33482edb590a",
|
||||
"b1719e044db05401a05a2ed588a3ad3f",
|
||||
"0xb68d6691c895ad0a70809470020929ef",
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut scenario = ScenarioBuilder::new("");
|
||||
|
||||
scenario.branch("", branches[0]); // at 0
|
||||
scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
|
||||
scenario.branch(branches[0], branches[1]); // at 108951064
|
||||
scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
|
||||
scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
|
||||
scenario.branch(branches[0], branches[2]); // at 283415424
|
||||
scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
|
||||
scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400
|
||||
|
||||
let (model, result) = scenario.calculate(100_000);
|
||||
|
||||
// FIXME: We previously calculated 333_792_000. But with this PR, we get
|
||||
// a much lower number. At a quick look at the model output and the
|
||||
// calculations here, the new result seems correct to me.
|
||||
eprintln!(
|
||||
" MODEL: {}",
|
||||
serde_json::to_string(&model.segments).unwrap()
|
||||
);
|
||||
eprintln!(
|
||||
"RESULT: {}",
|
||||
serde_json::to_string(&result.segments).unwrap()
|
||||
);
|
||||
|
||||
assert_eq!(result.total_size, 136_236_928);
|
||||
}
|
||||
@@ -12,41 +12,36 @@ anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
||||
nix.workspace = true
|
||||
signal-hook.workspace = true
|
||||
rand.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
rustls.workspace = true
|
||||
rustls-split.workspace = true
|
||||
git-version.workspace = true
|
||||
serde_with.workspace = true
|
||||
once_cell.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
url.workspace = true
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
|
||||
metrics.workspace = true
|
||||
pq_proto.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tempfile.workspace = true
|
||||
criterion.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
@@ -3,14 +3,14 @@ use crate::http::error;
|
||||
use anyhow::{anyhow, Context};
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::Method;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::RequestInfo;
|
||||
use routerify::{Middleware, Router, RouterBuilder, RouterService};
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::info;
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
use std::net::TcpListener;
|
||||
@@ -26,9 +26,83 @@ static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
|
||||
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
|
||||
Ok(res)
|
||||
static X_REQUEST_ID_HEADER_STR: &str = "x-request-id";
|
||||
|
||||
static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HEADER_STR);
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct RequestId(String);
|
||||
|
||||
/// Adds a tracing info_span! instrumentation around the handler events,
|
||||
/// logs the request start and end events for non-GET requests and non-200 responses.
|
||||
///
|
||||
/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
|
||||
/// in this type will get request info logged in the wrapping span, including the unique request ID.
|
||||
///
|
||||
/// There could be other ways to implement similar functionality:
|
||||
///
|
||||
/// * procmacros placed on top of all handler methods
|
||||
/// With all the drawbacks of procmacros, brings no difference implementation-wise,
|
||||
/// and little code reduction compared to the existing approach.
|
||||
///
|
||||
/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
|
||||
/// implemented for [`RouterBuilder`].
|
||||
/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
|
||||
///
|
||||
/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
|
||||
/// later, in a post-response middleware.
|
||||
/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
|
||||
/// tries to achive with its `.instrument` used in the current approach.
|
||||
///
|
||||
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
|
||||
pub struct RequestSpan<E, R, H>(pub H)
|
||||
where
|
||||
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
|
||||
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
|
||||
H: Fn(Request<Body>) -> R + Send + Sync + 'static;
|
||||
|
||||
impl<E, R, H> RequestSpan<E, R, H>
|
||||
where
|
||||
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
|
||||
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
|
||||
H: Fn(Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
/// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
|
||||
/// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
|
||||
pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
|
||||
let request_id = request.context::<RequestId>().unwrap_or_default().0;
|
||||
let method = request.method();
|
||||
let path = request.uri().path();
|
||||
let request_span = info_span!("request", %method, %path, %request_id);
|
||||
|
||||
let log_quietly = method == Method::GET;
|
||||
async move {
|
||||
if log_quietly {
|
||||
debug!("Handling request");
|
||||
} else {
|
||||
info!("Handling request");
|
||||
}
|
||||
|
||||
// Note that we reuse `error::handler` here and not returning and error at all,
|
||||
// yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
|
||||
// Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
|
||||
//
|
||||
// Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
|
||||
match (self.0)(request).await {
|
||||
Ok(response) => {
|
||||
let response_status = response.status();
|
||||
if log_quietly && response_status.is_success() {
|
||||
debug!("Request handled, status: {response_status}");
|
||||
} else {
|
||||
info!("Request handled, status: {response_status}");
|
||||
}
|
||||
Ok(response)
|
||||
}
|
||||
Err(e) => Ok(error::handler(e.into()).await),
|
||||
}
|
||||
}
|
||||
.instrument(request_span)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -55,10 +129,48 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) {
|
||||
Some(request_id) => request_id
|
||||
.to_str()
|
||||
.expect("extract request id value")
|
||||
.to_owned(),
|
||||
None => {
|
||||
let request_id = uuid::Uuid::new_v4();
|
||||
request_id.to_string()
|
||||
}
|
||||
};
|
||||
req.set_context(RequestId(request_id));
|
||||
|
||||
Ok(req)
|
||||
})
|
||||
}
|
||||
|
||||
async fn add_request_id_header_to_response(
|
||||
mut res: Response<Body>,
|
||||
req_info: RequestInfo,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if let Some(request_id) = req_info.context::<RequestId>() {
|
||||
if let Ok(request_header_value) = HeaderValue::from_str(&request_id.0) {
|
||||
res.headers_mut()
|
||||
.insert(&X_REQUEST_ID_HEADER, request_header_value);
|
||||
};
|
||||
};
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
Router::builder()
|
||||
.middleware(Middleware::post_with_info(logger))
|
||||
.get("/metrics", prometheus_metrics_handler)
|
||||
.middleware(add_request_id_middleware())
|
||||
.middleware(Middleware::post_with_info(
|
||||
add_request_id_header_to_response,
|
||||
))
|
||||
.get("/metrics", |r| {
|
||||
RequestSpan(prometheus_metrics_handler).handle(r)
|
||||
})
|
||||
.err_handler(error::handler)
|
||||
}
|
||||
|
||||
@@ -68,40 +180,43 @@ pub fn attach_openapi_ui(
|
||||
spec_mount_path: &'static str,
|
||||
ui_mount_path: &'static str,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
router_builder.get(spec_mount_path, move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(spec)).unwrap())
|
||||
}).get(ui_mount_path, move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(format!(r#"
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>rweb</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
|
||||
</head>
|
||||
<body>
|
||||
<div id="swagger-ui"></div>
|
||||
<script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
|
||||
<script>
|
||||
window.onload = function() {{
|
||||
const ui = SwaggerUIBundle({{
|
||||
"dom_id": "\#swagger-ui",
|
||||
presets: [
|
||||
SwaggerUIBundle.presets.apis,
|
||||
SwaggerUIBundle.SwaggerUIStandalonePreset
|
||||
],
|
||||
layout: "BaseLayout",
|
||||
deepLinking: true,
|
||||
showExtensions: true,
|
||||
showCommonExtensions: true,
|
||||
url: "{}",
|
||||
}})
|
||||
window.ui = ui;
|
||||
}};
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"#, spec_mount_path))).unwrap())
|
||||
})
|
||||
router_builder
|
||||
.get(spec_mount_path, move |r| {
|
||||
RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
|
||||
.handle(r)
|
||||
})
|
||||
.get(ui_mount_path, move |r| RequestSpan( move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(format!(r#"
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>rweb</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
|
||||
</head>
|
||||
<body>
|
||||
<div id="swagger-ui"></div>
|
||||
<script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
|
||||
<script>
|
||||
window.onload = function() {{
|
||||
const ui = SwaggerUIBundle({{
|
||||
"dom_id": "\#swagger-ui",
|
||||
presets: [
|
||||
SwaggerUIBundle.presets.apis,
|
||||
SwaggerUIBundle.SwaggerUIStandalonePreset
|
||||
],
|
||||
layout: "BaseLayout",
|
||||
deepLinking: true,
|
||||
showExtensions: true,
|
||||
showCommonExtensions: true,
|
||||
url: "{}",
|
||||
}})
|
||||
window.ui = ui;
|
||||
}};
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"#, spec_mount_path))).unwrap())
|
||||
}).handle(r))
|
||||
}
|
||||
|
||||
fn parse_token(header_value: &str) -> Result<&str, ApiError> {
|
||||
@@ -163,7 +278,7 @@ where
|
||||
async move {
|
||||
let headers = response.headers_mut();
|
||||
if headers.contains_key(&name) {
|
||||
tracing::warn!(
|
||||
warn!(
|
||||
"{} response already contains header {:?}",
|
||||
request_info.uri(),
|
||||
&name,
|
||||
@@ -223,3 +338,48 @@ where
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use futures::future::poll_fn;
|
||||
use hyper::service::Service;
|
||||
use routerify::RequestServiceBuilder;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_request_id_returned() {
|
||||
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
|
||||
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
|
||||
let mut service = builder.build(remote_addr);
|
||||
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
|
||||
panic!("request service is not ready: {:?}", e);
|
||||
}
|
||||
|
||||
let mut req: Request<Body> = Request::default();
|
||||
req.headers_mut()
|
||||
.append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap());
|
||||
|
||||
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
|
||||
|
||||
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap();
|
||||
|
||||
assert!(header_val == "42", "response header mismatch");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_request_id_empty() {
|
||||
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
|
||||
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
|
||||
let mut service = builder.build(remote_addr);
|
||||
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
|
||||
panic!("request service is not ready: {:?}", e);
|
||||
}
|
||||
|
||||
let req: Request<Body> = Request::default();
|
||||
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
|
||||
|
||||
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER);
|
||||
|
||||
assert_ne!(header_val, None, "response header should NOT be empty");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Buf;
|
||||
use hyper::{header, Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
|
||||
use super::error::ApiError;
|
||||
|
||||
@@ -31,3 +33,12 @@ pub fn json_response<T: Serialize>(
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Serialize through Display trait.
|
||||
pub fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
F: Display,
|
||||
{
|
||||
s.serialize_str(&format!("{}", z))
|
||||
}
|
||||
|
||||
@@ -13,8 +13,6 @@ pub mod simple_rcu;
|
||||
pub mod vec_map;
|
||||
|
||||
pub mod bin_ser;
|
||||
pub mod postgres_backend;
|
||||
pub mod postgres_backend_async;
|
||||
|
||||
// helper functions for creating and fsyncing
|
||||
pub mod crashsafe;
|
||||
@@ -27,9 +25,6 @@ pub mod id;
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
// socket splitting utils
|
||||
pub mod sock_split;
|
||||
|
||||
// common log initialisation routine
|
||||
pub mod logging;
|
||||
|
||||
|
||||
@@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Disable the default rust panic hook by using `set_hook`.
|
||||
///
|
||||
/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
|
||||
/// that sentry is configured (if needed). sentry will install it's own on top of this, always
|
||||
/// processing the panic before we log it.
|
||||
///
|
||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
||||
/// If the assumptions about the initialization order are not held, use
|
||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// lost.
|
||||
#[must_use]
|
||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
||||
std::panic::set_hook(Box::new(tracing_panic_hook));
|
||||
TracingPanicHookGuard::new()
|
||||
}
|
||||
|
||||
/// Drop guard which restores the std panic hook on drop.
|
||||
///
|
||||
/// Tracing should not be used when it's not configured, but we cannot really latch on to any
|
||||
/// imaginary lifetime of tracing.
|
||||
pub struct TracingPanicHookGuard {
|
||||
act: bool,
|
||||
}
|
||||
|
||||
impl TracingPanicHookGuard {
|
||||
fn new() -> Self {
|
||||
TracingPanicHookGuard { act: true }
|
||||
}
|
||||
|
||||
/// Make this hook guard not do anything when dropped.
|
||||
pub fn forget(&mut self) {
|
||||
self.act = false;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TracingPanicHookGuard {
|
||||
fn drop(&mut self) {
|
||||
if self.act {
|
||||
let _ = std::panic::take_hook();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Named symbol for our panic hook, which logs the panic.
|
||||
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
|
||||
// following rust 1.66.1 std implementation:
|
||||
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
||||
let location = info.location();
|
||||
|
||||
let msg = match info.payload().downcast_ref::<&'static str>() {
|
||||
Some(s) => *s,
|
||||
None => match info.payload().downcast_ref::<String>() {
|
||||
Some(s) => &s[..],
|
||||
None => "Box<dyn Any>",
|
||||
},
|
||||
};
|
||||
|
||||
let thread = std::thread::current();
|
||||
let thread = thread.name().unwrap_or("<unnamed>");
|
||||
let backtrace = std::backtrace::Backtrace::capture();
|
||||
|
||||
let _entered = if let Some(location) = location {
|
||||
tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
|
||||
} else {
|
||||
// very unlikely to hit here, but the guarantees of std could change
|
||||
tracing::error_span!("panic", %thread)
|
||||
}
|
||||
.entered();
|
||||
|
||||
if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
|
||||
// this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
|
||||
// get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
|
||||
// string, maybe even to a TLS one but tracing already does that.
|
||||
tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
|
||||
} else {
|
||||
tracing::error!("{msg}");
|
||||
}
|
||||
|
||||
// ensure that we log something on the panic if this hook is left after tracing has been
|
||||
// unconfigured. worst case when teardown is racing the panic is to log the panic twice.
|
||||
tracing::dispatcher::get_default(|d| {
|
||||
if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
|
||||
let location = location.map(PrettyLocation);
|
||||
log_panic_to_stderr(thread, msg, location, &backtrace);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[cold]
|
||||
fn log_panic_to_stderr(
|
||||
thread: &str,
|
||||
msg: &str,
|
||||
location: Option<PrettyLocation<'_, '_>>,
|
||||
backtrace: &std::backtrace::Backtrace,
|
||||
) {
|
||||
eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
|
||||
}
|
||||
|
||||
struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
|
||||
|
||||
impl std::fmt::Display for PrettyLocation<'_, '_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PrettyLocation<'_, '_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
<Self as std::fmt::Display>::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,485 +0,0 @@
|
||||
//! Server-side synchronous Postgres connection, as limited as we need.
|
||||
//! To use, create PostgresBackend and run() it, passing the Handler
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
|
||||
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
use std::net::{Shutdown, SocketAddr, TcpStream};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
|
||||
pub trait Handler {
|
||||
/// Handle single query.
|
||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||
/// might be not what we want after CopyData streaming, but currently we don't
|
||||
/// care).
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError>;
|
||||
|
||||
/// Called on startup packet receival, allows to process params.
|
||||
///
|
||||
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
|
||||
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
|
||||
/// to override whole init logic in implementations.
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check auth jwt
|
||||
fn check_auth_jwt(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_jwt_response: &[u8],
|
||||
) -> Result<(), QueryError> {
|
||||
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
|
||||
}
|
||||
|
||||
fn is_shutdown_requested(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// PostgresBackend protocol state.
|
||||
/// XXX: The order of the constructors matters.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
|
||||
pub enum ProtoState {
|
||||
Initialization,
|
||||
Encrypted,
|
||||
Authentication,
|
||||
Established,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum AuthType {
|
||||
Trust,
|
||||
// This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
|
||||
NeonJWT,
|
||||
}
|
||||
|
||||
impl FromStr for AuthType {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"Trust" => Ok(Self::Trust),
|
||||
"NeonJWT" => Ok(Self::NeonJWT),
|
||||
_ => anyhow::bail!("invalid value \"{s}\" for auth type"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for AuthType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(match self {
|
||||
AuthType::Trust => "Trust",
|
||||
AuthType::NeonJWT => "NeonJWT",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ProcessMsgResult {
|
||||
Continue,
|
||||
Break,
|
||||
}
|
||||
|
||||
/// Always-writeable sock_split stream.
|
||||
/// May not be readable. See [`PostgresBackend::take_stream_in`]
|
||||
pub enum Stream {
|
||||
Bidirectional(BidiStream),
|
||||
WriteOnly(WriteStream),
|
||||
}
|
||||
|
||||
impl Stream {
|
||||
fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Bidirectional(bidi_stream) => bidi_stream.shutdown(how),
|
||||
Self::WriteOnly(write_stream) => write_stream.shutdown(how),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Write for Stream {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Bidirectional(bidi_stream) => bidi_stream.write(buf),
|
||||
Self::WriteOnly(write_stream) => write_stream.write(buf),
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Bidirectional(bidi_stream) => bidi_stream.flush(),
|
||||
Self::WriteOnly(write_stream) => write_stream.flush(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresBackend {
|
||||
stream: Option<Stream>,
|
||||
// Output buffer. c.f. BeMessage::write why we are using BytesMut here.
|
||||
buf_out: BytesMut,
|
||||
|
||||
pub state: ProtoState,
|
||||
|
||||
auth_type: AuthType,
|
||||
|
||||
peer_addr: SocketAddr,
|
||||
pub tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
}
|
||||
|
||||
pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
|
||||
let mut query_string = query_string.to_vec();
|
||||
if let Some(ch) = query_string.last() {
|
||||
if *ch == 0 {
|
||||
query_string.pop();
|
||||
}
|
||||
}
|
||||
query_string
|
||||
}
|
||||
|
||||
// Helper function for socket read loops
|
||||
pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
|
||||
for cause in error.chain() {
|
||||
if let Some(io_error) = cause.downcast_ref::<io::Error>() {
|
||||
if io_error.kind() == std::io::ErrorKind::WouldBlock {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
impl PostgresBackend {
|
||||
pub fn new(
|
||||
socket: TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
set_read_timeout: bool,
|
||||
) -> io::Result<Self> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
if set_read_timeout {
|
||||
socket
|
||||
.set_read_timeout(Some(Duration::from_secs(5)))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
|
||||
buf_out: BytesMut::with_capacity(10 * 1024),
|
||||
state: ProtoState::Initialization,
|
||||
auth_type,
|
||||
tls_config,
|
||||
peer_addr,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn into_stream(self) -> Stream {
|
||||
self.stream.unwrap()
|
||||
}
|
||||
|
||||
/// Get direct reference (into the Option) to the read stream.
|
||||
fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
|
||||
match &mut self.stream {
|
||||
Some(Stream::Bidirectional(stream)) => Ok(stream),
|
||||
_ => anyhow::bail!("reader taken"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_peer_addr(&self) -> &SocketAddr {
|
||||
&self.peer_addr
|
||||
}
|
||||
|
||||
pub fn take_stream_in(&mut self) -> Option<ReadStream> {
|
||||
let stream = self.stream.take();
|
||||
match stream {
|
||||
Some(Stream::Bidirectional(bidi_stream)) => {
|
||||
let (read, write) = bidi_stream.split();
|
||||
self.stream = Some(Stream::WriteOnly(write));
|
||||
Some(read)
|
||||
}
|
||||
stream => {
|
||||
self.stream = stream;
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Read full message or return None if connection is closed.
|
||||
pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
|
||||
let (state, stream) = (self.state, self.get_stream_in()?);
|
||||
|
||||
use ProtoState::*;
|
||||
match state {
|
||||
Initialization | Encrypted => FeStartupPacket::read(stream),
|
||||
Authentication | Established => FeMessage::read(stream),
|
||||
}
|
||||
.map_err(QueryError::from)
|
||||
}
|
||||
|
||||
/// Write message into internal output buffer.
|
||||
pub fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<&mut Self> {
|
||||
BeMessage::write(&mut self.buf_out, message)?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Flush output buffer into the socket.
|
||||
pub fn flush(&mut self) -> io::Result<&mut Self> {
|
||||
let stream = self.stream.as_mut().unwrap();
|
||||
stream.write_all(&self.buf_out)?;
|
||||
self.buf_out.clear();
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Write message into internal buffer and flush it.
|
||||
pub fn write_message(&mut self, message: &BeMessage) -> io::Result<&mut Self> {
|
||||
self.write_message_noflush(message)?;
|
||||
self.flush()
|
||||
}
|
||||
|
||||
// Wrapper for run_message_loop() that shuts down socket when we are done
|
||||
pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
|
||||
let ret = self.run_message_loop(handler);
|
||||
if let Some(stream) = self.stream.as_mut() {
|
||||
let _ = stream.shutdown(Shutdown::Both);
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
|
||||
trace!("postgres backend to {:?} started", self.peer_addr);
|
||||
|
||||
let mut unnamed_query_string = Bytes::new();
|
||||
|
||||
while !handler.is_shutdown_requested() {
|
||||
match self.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(msg) = message {
|
||||
trace!("got message {msg:?}");
|
||||
|
||||
match self.process_message(handler, msg, &mut unnamed_query_string)? {
|
||||
ProcessMsgResult::Continue => continue,
|
||||
ProcessMsgResult::Break => break,
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if let QueryError::Other(e) = &e {
|
||||
if is_socket_read_timed_out(e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trace!("postgres backend to {:?} exited", self.peer_addr);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start_tls(&mut self) -> anyhow::Result<()> {
|
||||
match self.stream.take() {
|
||||
Some(Stream::Bidirectional(bidi_stream)) => {
|
||||
let conn = rustls::ServerConnection::new(self.tls_config.clone().unwrap())?;
|
||||
self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(conn)?));
|
||||
Ok(())
|
||||
}
|
||||
stream => {
|
||||
self.stream = stream;
|
||||
anyhow::bail!("can't start TLs without bidi stream");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn process_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
unnamed_query_string: &mut Bytes,
|
||||
) -> Result<ProcessMsgResult, QueryError> {
|
||||
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
|
||||
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
|
||||
if self.state < ProtoState::Established
|
||||
&& !matches!(
|
||||
msg,
|
||||
FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
|
||||
)
|
||||
{
|
||||
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
|
||||
}
|
||||
|
||||
let have_tls = self.tls_config.is_some();
|
||||
match msg {
|
||||
FeMessage::StartupPacket(m) => {
|
||||
trace!("got startup message {m:?}");
|
||||
|
||||
match m {
|
||||
FeStartupPacket::SslRequest => {
|
||||
debug!("SSL requested");
|
||||
|
||||
self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
|
||||
if have_tls {
|
||||
self.start_tls()?;
|
||||
self.state = ProtoState::Encrypted;
|
||||
}
|
||||
}
|
||||
FeStartupPacket::GssEncRequest => {
|
||||
debug!("GSS requested");
|
||||
self.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
FeStartupPacket::StartupMessage { .. } => {
|
||||
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
"must connect with TLS",
|
||||
None,
|
||||
))?;
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"client did not connect with TLS"
|
||||
)));
|
||||
}
|
||||
|
||||
// NB: startup() may change self.auth_type -- we are using that in proxy code
|
||||
// to bypass auth for new users.
|
||||
handler.startup(self, &m)?;
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => {
|
||||
self.write_message_noflush(&BeMessage::AuthenticationOk)?
|
||||
.write_message_noflush(&BeMessage::CLIENT_ENCODING)?
|
||||
// The async python driver requires a valid server_version
|
||||
.write_message_noflush(&BeMessage::server_version("14.1"))?
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
}
|
||||
}
|
||||
FeStartupPacket::CancelRequest { .. } => {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FeMessage::PasswordMessage(m) => {
|
||||
trace!("got password message '{:?}'", m);
|
||||
|
||||
assert!(self.state == ProtoState::Authentication);
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => unreachable!(),
|
||||
AuthType::NeonJWT => {
|
||||
let (_, jwt_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.write_message_noflush(&BeMessage::AuthenticationOk)?
|
||||
.write_message_noflush(&BeMessage::CLIENT_ENCODING)?
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
|
||||
FeMessage::Query(body) => {
|
||||
// remove null terminator
|
||||
let query_string = cstr_to_str(&body)?;
|
||||
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string) {
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Parse(m) => {
|
||||
*unnamed_query_string = m.query_string;
|
||||
self.write_message(&BeMessage::ParseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Describe(_) => {
|
||||
self.write_message_noflush(&BeMessage::ParameterDescription)?
|
||||
.write_message(&BeMessage::NoData)?;
|
||||
}
|
||||
|
||||
FeMessage::Bind(_) => {
|
||||
self.write_message(&BeMessage::BindComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Close(_) => {
|
||||
self.write_message(&BeMessage::CloseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Execute(_) => {
|
||||
let query_string = cstr_to_str(unnamed_query_string)?;
|
||||
trace!("got execute {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string) {
|
||||
log_query_error(query_string, &e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
// NOTE there is no ReadyForQuery message. This handler is used
|
||||
// for basebackup and it uses CopyOut which doesn't require
|
||||
// ReadyForQuery message and backend just switches back to
|
||||
// processing mode after sending CopyDone or ErrorResponse.
|
||||
}
|
||||
|
||||
FeMessage::Sync => {
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Terminate => {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
|
||||
// We prefer explicit pattern matching to wildcards, because
|
||||
// this helps us spot the places where new variants are missing
|
||||
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unexpected message type: {msg:?}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
}
|
||||
@@ -1,634 +0,0 @@
|
||||
//! Server-side asynchronous Postgres connection, as limited as we need.
|
||||
//! To use, create PostgresBackend and run() it, passing the Handler
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::postgres_backend::AuthType;
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
|
||||
use std::io;
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use std::{future::Future, task::ready};
|
||||
use tracing::{debug, error, info, trace};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
|
||||
pub fn is_expected_io_error(e: &io::Error) -> bool {
|
||||
use io::ErrorKind::*;
|
||||
matches!(
|
||||
e.kind(),
|
||||
ConnectionRefused | ConnectionAborted | ConnectionReset
|
||||
)
|
||||
}
|
||||
|
||||
/// An error, occurred during query processing:
|
||||
/// either during the connection ([`ConnectionError`]) or before/after it.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum QueryError {
|
||||
/// The connection was lost while processing the query.
|
||||
#[error(transparent)]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<io::Error> for QueryError {
|
||||
fn from(e: io::Error) -> Self {
|
||||
Self::Disconnected(ConnectionError::Socket(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Handler {
|
||||
/// Handle single query.
|
||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||
/// might be not what we want after CopyData streaming, but currently we don't
|
||||
/// care).
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError>;
|
||||
|
||||
/// Called on startup packet receival, allows to process params.
|
||||
///
|
||||
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
|
||||
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
|
||||
/// to override whole init logic in implementations.
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check auth jwt
|
||||
fn check_auth_jwt(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_jwt_response: &[u8],
|
||||
) -> Result<(), QueryError> {
|
||||
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
|
||||
}
|
||||
}
|
||||
|
||||
/// PostgresBackend protocol state.
|
||||
/// XXX: The order of the constructors matters.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
|
||||
pub enum ProtoState {
|
||||
Initialization,
|
||||
Encrypted,
|
||||
Authentication,
|
||||
Established,
|
||||
Closed,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ProcessMsgResult {
|
||||
Continue,
|
||||
Break,
|
||||
}
|
||||
|
||||
/// Always-writeable sock_split stream.
|
||||
/// May not be readable. See [`PostgresBackend::take_stream_in`]
|
||||
pub enum Stream {
|
||||
Unencrypted(BufReader<tokio::net::TcpStream>),
|
||||
Tls(Box<tokio_rustls::server::TlsStream<BufReader<tokio::net::TcpStream>>>),
|
||||
Broken,
|
||||
}
|
||||
|
||||
impl AsyncWrite for Stream {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl AsyncRead for Stream {
|
||||
fn poll_read(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut tokio::io::ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresBackend {
|
||||
stream: Stream,
|
||||
|
||||
// Output buffer. c.f. BeMessage::write why we are using BytesMut here.
|
||||
// The data between 0 and "current position" as tracked by the bytes::Buf
|
||||
// implementation of BytesMut, have already been written.
|
||||
buf_out: BytesMut,
|
||||
|
||||
pub state: ProtoState,
|
||||
|
||||
auth_type: AuthType,
|
||||
|
||||
peer_addr: SocketAddr,
|
||||
pub tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
}
|
||||
|
||||
pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
|
||||
let mut query_string = query_string.to_vec();
|
||||
if let Some(ch) = query_string.last() {
|
||||
if *ch == 0 {
|
||||
query_string.pop();
|
||||
}
|
||||
}
|
||||
query_string
|
||||
}
|
||||
|
||||
// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
impl PostgresBackend {
|
||||
pub fn new(
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> io::Result<Self> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
|
||||
Ok(Self {
|
||||
stream: Stream::Unencrypted(BufReader::new(socket)),
|
||||
buf_out: BytesMut::with_capacity(10 * 1024),
|
||||
state: ProtoState::Initialization,
|
||||
auth_type,
|
||||
tls_config,
|
||||
peer_addr,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_peer_addr(&self) -> &SocketAddr {
|
||||
&self.peer_addr
|
||||
}
|
||||
|
||||
/// Read full message or return None if connection is closed.
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
|
||||
use ProtoState::*;
|
||||
match self.state {
|
||||
Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
|
||||
Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
|
||||
Closed => Ok(None),
|
||||
}
|
||||
.map_err(QueryError::from)
|
||||
}
|
||||
|
||||
/// Flush output buffer into the socket.
|
||||
pub async fn flush(&mut self) -> io::Result<()> {
|
||||
while self.buf_out.has_remaining() {
|
||||
let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
|
||||
self.buf_out.advance(bytes_written);
|
||||
}
|
||||
self.buf_out.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write message into internal output buffer.
|
||||
pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
|
||||
BeMessage::write(&mut self.buf_out, message)?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Returns an AsyncWrite implementation that wraps all the data written
|
||||
/// to it in CopyData messages, and writes them to the connection
|
||||
///
|
||||
/// The caller is responsible for sending CopyOutResponse and CopyDone messages.
|
||||
pub fn copyout_writer(&mut self) -> CopyDataWriter {
|
||||
CopyDataWriter { pgb: self }
|
||||
}
|
||||
|
||||
/// A polling function that tries to write all the data from 'buf_out' to the
|
||||
/// underlying stream.
|
||||
fn poll_write_buf(
|
||||
&mut self,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
while self.buf_out.has_remaining() {
|
||||
match ready!(Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk())) {
|
||||
Ok(bytes_written) => self.buf_out.advance(bytes_written),
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
}
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
|
||||
fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
|
||||
Pin::new(&mut self.stream).poll_flush(cx)
|
||||
}
|
||||
|
||||
// Wrapper for run_message_loop() that shuts down socket when we are done
|
||||
pub async fn run<F, S>(
|
||||
mut self,
|
||||
handler: &mut impl Handler,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
{
|
||||
let ret = self.run_message_loop(handler, shutdown_watcher).await;
|
||||
let _ = self.stream.shutdown();
|
||||
ret
|
||||
}
|
||||
|
||||
async fn run_message_loop<F, S>(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
{
|
||||
trace!("postgres backend to {:?} started", self.peer_addr);
|
||||
|
||||
tokio::select!(
|
||||
biased;
|
||||
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during handshake");
|
||||
return Ok(())
|
||||
},
|
||||
|
||||
result = async {
|
||||
while self.state < ProtoState::Established {
|
||||
if let Some(msg) = self.read_message().await? {
|
||||
trace!("got message {msg:?} during handshake");
|
||||
|
||||
match self.process_handshake_message(handler, msg).await? {
|
||||
ProcessMsgResult::Continue => {
|
||||
self.flush().await?;
|
||||
continue;
|
||||
}
|
||||
ProcessMsgResult::Break => {
|
||||
trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Ok::<(), QueryError>(())
|
||||
} => {
|
||||
// Handshake complete.
|
||||
result?;
|
||||
}
|
||||
);
|
||||
|
||||
// Authentication completed
|
||||
let mut query_string = Bytes::new();
|
||||
while let Some(msg) = tokio::select!(
|
||||
biased;
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received in run_message_loop");
|
||||
Ok(None)
|
||||
},
|
||||
msg = self.read_message() => { msg },
|
||||
)? {
|
||||
trace!("got message {:?}", msg);
|
||||
|
||||
let result = self.process_message(handler, msg, &mut query_string).await;
|
||||
self.flush().await?;
|
||||
match result? {
|
||||
ProcessMsgResult::Continue => {
|
||||
self.flush().await?;
|
||||
continue;
|
||||
}
|
||||
ProcessMsgResult::Break => break,
|
||||
}
|
||||
}
|
||||
|
||||
trace!("postgres backend to {:?} exited", self.peer_addr);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_tls(&mut self) -> anyhow::Result<()> {
|
||||
if let Stream::Unencrypted(plain_stream) =
|
||||
std::mem::replace(&mut self.stream, Stream::Broken)
|
||||
{
|
||||
let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap());
|
||||
let tls_stream = acceptor.accept(plain_stream).await?;
|
||||
|
||||
self.stream = Stream::Tls(Box::new(tls_stream));
|
||||
return Ok(());
|
||||
};
|
||||
anyhow::bail!("TLS already started");
|
||||
}
|
||||
|
||||
async fn process_handshake_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
) -> Result<ProcessMsgResult, QueryError> {
|
||||
assert!(self.state < ProtoState::Established);
|
||||
let have_tls = self.tls_config.is_some();
|
||||
match msg {
|
||||
FeMessage::StartupPacket(m) => {
|
||||
trace!("got startup message {m:?}");
|
||||
|
||||
match m {
|
||||
FeStartupPacket::SslRequest => {
|
||||
debug!("SSL requested");
|
||||
|
||||
self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
|
||||
if have_tls {
|
||||
self.start_tls().await?;
|
||||
self.state = ProtoState::Encrypted;
|
||||
}
|
||||
}
|
||||
FeStartupPacket::GssEncRequest => {
|
||||
debug!("GSS requested");
|
||||
self.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
FeStartupPacket::StartupMessage { .. } => {
|
||||
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
"must connect with TLS",
|
||||
None,
|
||||
))?;
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"client did not connect with TLS"
|
||||
)));
|
||||
}
|
||||
|
||||
// NB: startup() may change self.auth_type -- we are using that in proxy code
|
||||
// to bypass auth for new users.
|
||||
handler.startup(self, &m)?;
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => {
|
||||
self.write_message(&BeMessage::AuthenticationOk)?
|
||||
.write_message(&BeMessage::CLIENT_ENCODING)?
|
||||
// The async python driver requires a valid server_version
|
||||
.write_message(&BeMessage::server_version("14.1"))?
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
}
|
||||
}
|
||||
FeStartupPacket::CancelRequest { .. } => {
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FeMessage::PasswordMessage(m) => {
|
||||
trace!("got password message '{:?}'", m);
|
||||
|
||||
assert!(self.state == ProtoState::Authentication);
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => unreachable!(),
|
||||
AuthType::NeonJWT => {
|
||||
let (_, jwt_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.write_message(&BeMessage::AuthenticationOk)?
|
||||
.write_message(&BeMessage::CLIENT_ENCODING)?
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
|
||||
_ => {
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
}
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
|
||||
async fn process_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
unnamed_query_string: &mut Bytes,
|
||||
) -> Result<ProcessMsgResult, QueryError> {
|
||||
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
|
||||
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
|
||||
assert!(self.state == ProtoState::Established);
|
||||
|
||||
match msg {
|
||||
FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
|
||||
}
|
||||
|
||||
FeMessage::Query(body) => {
|
||||
// remove null terminator
|
||||
let query_string = cstr_to_str(&body)?;
|
||||
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Parse(m) => {
|
||||
*unnamed_query_string = m.query_string;
|
||||
self.write_message(&BeMessage::ParseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Describe(_) => {
|
||||
self.write_message(&BeMessage::ParameterDescription)?
|
||||
.write_message(&BeMessage::NoData)?;
|
||||
}
|
||||
|
||||
FeMessage::Bind(_) => {
|
||||
self.write_message(&BeMessage::BindComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Close(_) => {
|
||||
self.write_message(&BeMessage::CloseComplete)?;
|
||||
}
|
||||
|
||||
FeMessage::Execute(_) => {
|
||||
let query_string = cstr_to_str(unnamed_query_string)?;
|
||||
trace!("got execute {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
log_query_error(query_string, &e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
// NOTE there is no ReadyForQuery message. This handler is used
|
||||
// for basebackup and it uses CopyOut which doesn't require
|
||||
// ReadyForQuery message and backend just switches back to
|
||||
// processing mode after sending CopyDone or ErrorResponse.
|
||||
}
|
||||
|
||||
FeMessage::Sync => {
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
|
||||
FeMessage::Terminate => {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
|
||||
// We prefer explicit pattern matching to wildcards, because
|
||||
// this helps us spot the places where new variants are missing
|
||||
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unexpected message type: {:?}",
|
||||
msg
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
|
||||
/// messages.
|
||||
///
|
||||
|
||||
pub struct CopyDataWriter<'a> {
|
||||
pgb: &'a mut PostgresBackend,
|
||||
}
|
||||
|
||||
impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
|
||||
// It's not strictly required to flush between each message, but makes it easier
|
||||
// to view in wireshark, and usually the messages that the callers write are
|
||||
// decently-sized anyway.
|
||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
||||
Ok(()) => {}
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
|
||||
// CopyData
|
||||
// XXX: if the input is large, we should split it into multiple messages.
|
||||
// Not sure what the threshold should be, but the ultimate hard limit is that
|
||||
// the length cannot exceed u32.
|
||||
this.pgb.write_message(&BeMessage::CopyData(buf))?;
|
||||
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
|
||||
fn poll_flush(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
||||
Ok(()) => {}
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
||||
Ok(()) => {}
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn log_query_error(query: &str, e: &QueryError) {
|
||||
match e {
|
||||
QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
|
||||
if is_expected_io_error(io_error) {
|
||||
info!("query handler for '{query}' failed with expected io error: {io_error}");
|
||||
} else {
|
||||
error!("query handler for '{query}' failed with io error: {io_error}");
|
||||
}
|
||||
}
|
||||
QueryError::Disconnected(other_connection_error) => {
|
||||
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
||||
}
|
||||
QueryError::Other(e) => {
|
||||
error!("query handler for '{query}' failed: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,206 +0,0 @@
|
||||
use std::{
|
||||
io::{self, BufReader, Write},
|
||||
net::{Shutdown, TcpStream},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use rustls::Connection;
|
||||
|
||||
/// Wrapper supporting reads of a shared TcpStream.
|
||||
pub struct ArcTcpRead(Arc<TcpStream>);
|
||||
|
||||
impl io::Read for ArcTcpRead {
|
||||
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
(&*self.0).read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for ArcTcpRead {
|
||||
type Target = TcpStream;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.deref()
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper around a TCP Stream supporting buffered reads.
|
||||
pub struct BufStream(BufReader<ArcTcpRead>);
|
||||
|
||||
impl io::Read for BufStream {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
self.0.read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Write for BufStream {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.get_ref().write(buf)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.get_ref().flush()
|
||||
}
|
||||
}
|
||||
|
||||
impl BufStream {
|
||||
/// Unwrap into the internal BufReader.
|
||||
fn into_reader(self) -> BufReader<ArcTcpRead> {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Returns a reference to the underlying TcpStream.
|
||||
fn get_ref(&self) -> &TcpStream {
|
||||
&self.0.get_ref().0
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ReadStream {
|
||||
Tcp(BufReader<ArcTcpRead>),
|
||||
Tls(rustls_split::ReadHalf),
|
||||
}
|
||||
|
||||
impl io::Read for ReadStream {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Tcp(reader) => reader.read(buf),
|
||||
Self::Tls(read_half) => read_half.read(buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ReadStream {
|
||||
pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.get_ref().shutdown(how),
|
||||
Self::Tls(write_half) => write_half.shutdown(how),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum WriteStream {
|
||||
Tcp(Arc<TcpStream>),
|
||||
Tls(rustls_split::WriteHalf),
|
||||
}
|
||||
|
||||
impl WriteStream {
|
||||
pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.shutdown(how),
|
||||
Self::Tls(write_half) => write_half.shutdown(how),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Write for WriteStream {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.as_ref().write(buf),
|
||||
Self::Tls(write_half) => write_half.write(buf),
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.as_ref().flush(),
|
||||
Self::Tls(write_half) => write_half.flush(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type TlsStream<T> = rustls::StreamOwned<rustls::ServerConnection, T>;
|
||||
|
||||
pub enum BidiStream {
|
||||
Tcp(BufStream),
|
||||
/// This variant is boxed, because [`rustls::ServerConnection`] is quite larger than [`BufStream`].
|
||||
Tls(Box<TlsStream<BufStream>>),
|
||||
}
|
||||
|
||||
impl BidiStream {
|
||||
pub fn from_tcp(stream: TcpStream) -> Self {
|
||||
Self::Tcp(BufStream(BufReader::new(ArcTcpRead(Arc::new(stream)))))
|
||||
}
|
||||
|
||||
pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.get_ref().shutdown(how),
|
||||
Self::Tls(tls_boxed) => {
|
||||
if how == Shutdown::Read {
|
||||
tls_boxed.sock.get_ref().shutdown(how)
|
||||
} else {
|
||||
tls_boxed.conn.send_close_notify();
|
||||
let res = tls_boxed.flush();
|
||||
tls_boxed.sock.get_ref().shutdown(how)?;
|
||||
res
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the bi-directional stream into two owned read and write halves.
|
||||
pub fn split(self) -> (ReadStream, WriteStream) {
|
||||
match self {
|
||||
Self::Tcp(stream) => {
|
||||
let reader = stream.into_reader();
|
||||
let stream: Arc<TcpStream> = reader.get_ref().0.clone();
|
||||
|
||||
(ReadStream::Tcp(reader), WriteStream::Tcp(stream))
|
||||
}
|
||||
Self::Tls(tls_boxed) => {
|
||||
let reader = tls_boxed.sock.into_reader();
|
||||
let buffer_data = reader.buffer().to_owned();
|
||||
let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192);
|
||||
let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192);
|
||||
|
||||
// TODO would be nice to avoid the Arc here
|
||||
let socket = Arc::try_unwrap(reader.into_inner().0).unwrap();
|
||||
|
||||
let (read_half, write_half) = rustls_split::split(
|
||||
socket,
|
||||
Connection::Server(tls_boxed.conn),
|
||||
read_buf_cfg,
|
||||
write_buf_cfg,
|
||||
);
|
||||
(ReadStream::Tls(read_half), WriteStream::Tls(write_half))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_tls(self, mut conn: rustls::ServerConnection) -> io::Result<Self> {
|
||||
match self {
|
||||
Self::Tcp(mut stream) => {
|
||||
conn.complete_io(&mut stream)?;
|
||||
assert!(!conn.is_handshaking());
|
||||
Ok(Self::Tls(Box::new(TlsStream::new(conn, stream))))
|
||||
}
|
||||
Self::Tls { .. } => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"TLS is already started on this stream",
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Read for BidiStream {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.read(buf),
|
||||
Self::Tls(tls_boxed) => tls_boxed.read(buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Write for BidiStream {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.write(buf),
|
||||
Self::Tls(tls_boxed) => tls_boxed.write(buf),
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Tcp(stream) => stream.flush(),
|
||||
Self::Tls(tls_boxed) => tls_boxed.flush(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,238 +0,0 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::{Cursor, Read, Write},
|
||||
net::{TcpListener, TcpStream},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use utils::{
|
||||
postgres_backend::{AuthType, Handler, PostgresBackend},
|
||||
postgres_backend_async::QueryError,
|
||||
};
|
||||
|
||||
fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
||||
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
let client_stream = TcpStream::connect(addr).unwrap();
|
||||
let (server_stream, _) = listener.accept().unwrap();
|
||||
(server_stream, client_stream)
|
||||
}
|
||||
|
||||
static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("key.pem"));
|
||||
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
|
||||
static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
|
||||
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
|
||||
#[test]
|
||||
// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274),
|
||||
// we resize the vector so doing some modifications after all
|
||||
#[allow(clippy::read_zero_byte_vec)]
|
||||
fn ssl() {
|
||||
let (mut client_sock, server_sock) = make_tcp_pair();
|
||||
|
||||
const QUERY: &str = "hello world";
|
||||
|
||||
let client_jh = std::thread::spawn(move || {
|
||||
// SSLRequest
|
||||
client_sock.write_u32::<BigEndian>(8).unwrap();
|
||||
client_sock.write_u32::<BigEndian>(80877103).unwrap();
|
||||
|
||||
let ssl_response = client_sock.read_u8().unwrap();
|
||||
assert_eq!(b'S', ssl_response);
|
||||
|
||||
let cfg = rustls::ClientConfig::builder()
|
||||
.with_safe_defaults()
|
||||
.with_root_certificates({
|
||||
let mut store = rustls::RootCertStore::empty();
|
||||
store.add(&CERT).unwrap();
|
||||
store
|
||||
})
|
||||
.with_no_client_auth();
|
||||
let client_config = Arc::new(cfg);
|
||||
|
||||
let dns_name = "localhost".try_into().unwrap();
|
||||
let mut conn = rustls::ClientConnection::new(client_config, dns_name).unwrap();
|
||||
|
||||
conn.complete_io(&mut client_sock).unwrap();
|
||||
assert!(!conn.is_handshaking());
|
||||
|
||||
let mut stream = rustls::Stream::new(&mut conn, &mut client_sock);
|
||||
|
||||
// StartupMessage
|
||||
stream.write_u32::<BigEndian>(9).unwrap();
|
||||
stream.write_u32::<BigEndian>(196608).unwrap();
|
||||
stream.write_u8(0).unwrap();
|
||||
stream.flush().unwrap();
|
||||
|
||||
// wait for ReadyForQuery
|
||||
let mut msg_buf = Vec::new();
|
||||
loop {
|
||||
let msg = stream.read_u8().unwrap();
|
||||
let size = stream.read_u32::<BigEndian>().unwrap() - 4;
|
||||
msg_buf.resize(size as usize, 0);
|
||||
stream.read_exact(&mut msg_buf).unwrap();
|
||||
|
||||
if msg == b'Z' {
|
||||
// ReadyForQuery
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Query
|
||||
stream.write_u8(b'Q').unwrap();
|
||||
stream
|
||||
.write_u32::<BigEndian>(4u32 + QUERY.len() as u32)
|
||||
.unwrap();
|
||||
stream.write_all(QUERY.as_ref()).unwrap();
|
||||
stream.flush().unwrap();
|
||||
|
||||
// ReadyForQuery
|
||||
let msg = stream.read_u8().unwrap();
|
||||
assert_eq!(msg, b'Z');
|
||||
});
|
||||
|
||||
struct TestHandler {
|
||||
got_query: bool,
|
||||
}
|
||||
impl Handler for TestHandler {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError> {
|
||||
self.got_query = query_string == QUERY;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
let mut handler = TestHandler { got_query: false };
|
||||
|
||||
let cfg = rustls::ServerConfig::builder()
|
||||
.with_safe_defaults()
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(vec![CERT.clone()], KEY.clone())
|
||||
.unwrap();
|
||||
let tls_config = Some(Arc::new(cfg));
|
||||
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
|
||||
pgb.run(&mut handler).unwrap();
|
||||
assert!(handler.got_query);
|
||||
|
||||
client_jh.join().unwrap();
|
||||
|
||||
// TODO consider shutdown behavior
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_ssl() {
|
||||
let (mut client_sock, server_sock) = make_tcp_pair();
|
||||
|
||||
let client_jh = std::thread::spawn(move || {
|
||||
let mut buf = BytesMut::new();
|
||||
|
||||
// SSLRequest
|
||||
buf.put_u32(8);
|
||||
buf.put_u32(80877103);
|
||||
client_sock.write_all(&buf).unwrap();
|
||||
buf.clear();
|
||||
|
||||
let ssl_response = client_sock.read_u8().unwrap();
|
||||
assert_eq!(b'N', ssl_response);
|
||||
});
|
||||
|
||||
struct TestHandler;
|
||||
|
||||
impl Handler for TestHandler {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_query_string: &str,
|
||||
) -> Result<(), QueryError> {
|
||||
panic!()
|
||||
}
|
||||
}
|
||||
|
||||
let mut handler = TestHandler;
|
||||
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap();
|
||||
pgb.run(&mut handler).unwrap();
|
||||
|
||||
client_jh.join().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn server_forces_ssl() {
|
||||
let (mut client_sock, server_sock) = make_tcp_pair();
|
||||
|
||||
let client_jh = std::thread::spawn(move || {
|
||||
// StartupMessage
|
||||
client_sock.write_u32::<BigEndian>(9).unwrap();
|
||||
client_sock.write_u32::<BigEndian>(196608).unwrap();
|
||||
client_sock.write_u8(0).unwrap();
|
||||
client_sock.flush().unwrap();
|
||||
|
||||
// ErrorResponse
|
||||
assert_eq!(client_sock.read_u8().unwrap(), b'E');
|
||||
let len = client_sock.read_u32::<BigEndian>().unwrap() - 4;
|
||||
|
||||
let mut body = vec![0; len as usize];
|
||||
client_sock.read_exact(&mut body).unwrap();
|
||||
let mut body = Bytes::from(body);
|
||||
|
||||
let mut errors = HashMap::new();
|
||||
loop {
|
||||
let field_type = body.get_u8();
|
||||
if field_type == 0u8 {
|
||||
break;
|
||||
}
|
||||
|
||||
let end_idx = body.iter().position(|&b| b == 0u8).unwrap();
|
||||
let mut value = body.split_to(end_idx + 1);
|
||||
assert_eq!(value[end_idx], 0u8);
|
||||
value.truncate(end_idx);
|
||||
let old = errors.insert(field_type, value);
|
||||
assert!(old.is_none());
|
||||
}
|
||||
|
||||
assert!(!body.has_remaining());
|
||||
|
||||
assert_eq!("must connect with TLS", errors.get(&b'M').unwrap());
|
||||
|
||||
// TODO read failure
|
||||
});
|
||||
|
||||
struct TestHandler;
|
||||
impl Handler for TestHandler {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_query_string: &str,
|
||||
) -> Result<(), QueryError> {
|
||||
panic!()
|
||||
}
|
||||
}
|
||||
let mut handler = TestHandler;
|
||||
|
||||
let cfg = rustls::ServerConfig::builder()
|
||||
.with_safe_defaults()
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(vec![CERT.clone()], KEY.clone())
|
||||
.unwrap();
|
||||
let tls_config = Some(Arc::new(cfg));
|
||||
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
|
||||
let res = pgb.run(&mut handler).unwrap_err();
|
||||
assert_eq!("client did not connect with TLS", format!("{}", res));
|
||||
|
||||
client_jh.join().unwrap();
|
||||
|
||||
// TODO consider shutdown behavior
|
||||
}
|
||||
@@ -23,6 +23,7 @@ const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
@@ -36,6 +37,7 @@ num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
rand.workspace = true
|
||||
@@ -51,7 +53,7 @@ thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
|
||||
@@ -33,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
|
||||
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
|
||||
use postgres_ffi::TransactionId;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::PG_TLI;
|
||||
@@ -190,14 +191,31 @@ where
|
||||
{
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
|
||||
// Gather and send relational files in each database if full backup is requested.
|
||||
if self.full_backup {
|
||||
for rel in self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_rel(rel).await?;
|
||||
// If full backup is requested, include all relation files.
|
||||
// Otherwise only include init forks of unlogged relations.
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
// contents of UNLOGGED relations. Postgres copies it in
|
||||
// `reinit.c` during recovery.
|
||||
if rel.forknum == INIT_FORKNUM {
|
||||
// I doubt we need _init fork itself, but having it at least
|
||||
// serves as a marker relation is unlogged.
|
||||
self.add_rel(rel, rel).await?;
|
||||
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
|
||||
continue;
|
||||
}
|
||||
|
||||
if self.full_backup {
|
||||
if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM))
|
||||
{
|
||||
// skip this, will include it when we reach the init fork
|
||||
continue;
|
||||
}
|
||||
self.add_rel(rel, rel).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -220,15 +238,16 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
||||
/// Add contents of relfilenode `src`, naming it as `dst`.
|
||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(tag, self.lsn, false, self.ctx)
|
||||
.get_rel_size(src, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
if nblocks == 0 {
|
||||
let file_name = tag.to_segfile_name(0);
|
||||
let file_name = dst.to_segfile_name(0);
|
||||
let header = new_tar_header(&file_name, 0)?;
|
||||
self.ar.append(&header, &mut io::empty()).await?;
|
||||
return Ok(());
|
||||
@@ -244,12 +263,12 @@ where
|
||||
for blknum in startblk..endblk {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx)
|
||||
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
|
||||
let file_name = tag.to_segfile_name(seg as u32);
|
||||
let file_name = dst.to_segfile_name(seg as u32);
|
||||
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
||||
self.ar.append(&header, segment_data.as_slice()).await?;
|
||||
|
||||
|
||||
@@ -23,11 +23,10 @@ use pageserver::{
|
||||
tenant::mgr,
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
logging, project_git_version,
|
||||
sentry_init::init_sentry,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
@@ -88,6 +87,13 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize logging, which must be initialized before the custom panic hook is installed.
|
||||
logging::init(conf.log_format)?;
|
||||
|
||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
||||
// disarming this hook on pageserver, because we never tear down tracing.
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(
|
||||
Some(GIT_VERSION.into()),
|
||||
@@ -210,9 +216,6 @@ fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<()> {
|
||||
// Initialize logging
|
||||
logging::init(conf.log_format)?;
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
@@ -277,33 +280,17 @@ fn start_pageserver(
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
// TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
|
||||
match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
|
||||
(old, Ok(v)) => {
|
||||
match var("NEON_AUTH_TOKEN") {
|
||||
Ok(v) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
if let Ok(v_old) = old {
|
||||
warn!(
|
||||
"JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
|
||||
);
|
||||
if v_old != v {
|
||||
warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
|
||||
}
|
||||
}
|
||||
pageserver::config::SAFEKEEPER_AUTH_TOKEN
|
||||
.set(Arc::new(v))
|
||||
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
|
||||
}
|
||||
(Ok(v), _) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
|
||||
pageserver::config::SAFEKEEPER_AUTH_TOKEN
|
||||
.set(Arc::new(v))
|
||||
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
|
||||
}
|
||||
(_, Err(VarError::NotPresent)) => {
|
||||
Err(VarError::NotPresent) => {
|
||||
info!("No JWT token for authentication with Safekeeper detected");
|
||||
}
|
||||
(_, Err(e)) => {
|
||||
Err(e) => {
|
||||
return Err(e).with_context(|| {
|
||||
"Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
|
||||
})
|
||||
|
||||
@@ -21,10 +21,10 @@ use std::time::Duration;
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
|
||||
use postgres_backend::AuthType;
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
logging::LogFormat,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::tenant::config::TenantConf;
|
||||
@@ -698,6 +698,12 @@ impl PageServerConf {
|
||||
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
|
||||
}
|
||||
|
||||
if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
|
||||
t_conf.image_creation_threshold = Some(
|
||||
parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(gc_horizon) = item.get("gc_horizon") {
|
||||
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
|
||||
}
|
||||
@@ -731,6 +737,13 @@ impl PageServerConf {
|
||||
})?);
|
||||
}
|
||||
|
||||
if let Some(eviction_policy) = item.get("eviction_policy") {
|
||||
t_conf.eviction_policy = Some(
|
||||
toml_edit::de::from_item(eviction_policy.clone())
|
||||
.context("parse eviction_policy")?,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize)]
|
||||
#[derive(Serialize, Debug)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
@@ -75,7 +75,7 @@ pub async fn collect_metrics(
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut prev_iteration_time: Option<std::time::Instant> = None;
|
||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -86,11 +86,11 @@ pub async fn collect_metrics(
|
||||
_ = ticker.tick() => {
|
||||
|
||||
// send cached metrics every cached_metric_collection_interval
|
||||
let send_cached = prev_iteration_time
|
||||
.map(|x| x.elapsed() >= cached_metric_collection_interval)
|
||||
.unwrap_or(false);
|
||||
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
|
||||
|
||||
prev_iteration_time = Some(std::time::Instant::now());
|
||||
if send_cached {
|
||||
prev_iteration_time = std::time::Instant::now();
|
||||
}
|
||||
|
||||
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
|
||||
}
|
||||
@@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration(
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk_to_send.iter() {
|
||||
// Report if the metric value is suspiciously large
|
||||
if metric.value > (1u64 << 40) {
|
||||
error!("potentially abnormal metric value: {:?}", metric);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
|
||||
@@ -245,6 +245,53 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
put:
|
||||
description: Garbage collect given timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/{tenant_id}/attach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -437,6 +484,13 @@ paths:
|
||||
type: boolean
|
||||
description: |
|
||||
When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
|
||||
- name: retention_period
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
description: |
|
||||
Override the default retention period (in bytes) used for size calculation.
|
||||
get:
|
||||
description: |
|
||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||
|
||||
@@ -7,19 +7,22 @@ use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::http::endpoint::RequestSpan;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::mgr::TenantMapInsertError;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
@@ -38,7 +41,7 @@ use utils::{
|
||||
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||
use super::models::ConfigureFailpointsRequest;
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -479,11 +482,19 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
|
||||
/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
|
||||
/// values.
|
||||
///
|
||||
/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size
|
||||
/// (only if it is shorter than the real cutoff).
|
||||
///
|
||||
/// Note: we don't update the cached size and prometheus metric here.
|
||||
/// The retention period might be different, and it's nice to have a method to just calculate it
|
||||
/// without modifying anything anyway.
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||
let headers = request.headers();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
@@ -492,24 +503,29 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs(&ctx)
|
||||
.gather_size_inputs(retention_period, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let size = if !inputs_only.unwrap_or(false) {
|
||||
Some(
|
||||
tenant
|
||||
.calc_and_update_cached_synthetic_size(&inputs)
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut sizes = None;
|
||||
if !inputs_only.unwrap_or(false) {
|
||||
let storage_model = inputs
|
||||
.calculate_model()
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let size = storage_model.calculate();
|
||||
|
||||
/// Private response type with the additional "unstable" `inputs` field.
|
||||
///
|
||||
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
|
||||
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
|
||||
// If request header expects html, return html
|
||||
if headers["Accept"] == "text/html" {
|
||||
return synthetic_size_html_response(inputs, storage_model, size);
|
||||
}
|
||||
sizes = Some(size);
|
||||
} else if headers["Accept"] == "text/html" {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"inputs_only parameter is incompatible with html output request"
|
||||
)));
|
||||
}
|
||||
|
||||
/// The type resides in the pageserver not to expose `ModelInputs`.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct TenantHistorySize {
|
||||
@@ -519,6 +535,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
///
|
||||
/// Will be none if `?inputs_only=true` was given.
|
||||
size: Option<u64>,
|
||||
/// Size of each segment used in the model.
|
||||
/// Will be null if `?inputs_only=true` was given.
|
||||
segment_sizes: Option<Vec<tenant_size_model::SegmentSizeResult>>,
|
||||
inputs: crate::tenant::size::ModelInputs,
|
||||
}
|
||||
|
||||
@@ -526,7 +545,8 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
StatusCode::OK,
|
||||
TenantHistorySize {
|
||||
id: tenant_id,
|
||||
size,
|
||||
size: sizes.as_ref().map(|x| x.total_size),
|
||||
segment_sizes: sizes.map(|x| x.segments),
|
||||
inputs,
|
||||
},
|
||||
)
|
||||
@@ -591,6 +611,62 @@ async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response
|
||||
}
|
||||
}
|
||||
|
||||
/// Get tenant_size SVG graph along with the JSON data.
|
||||
fn synthetic_size_html_response(
|
||||
inputs: ModelInputs,
|
||||
storage_model: StorageModel,
|
||||
sizes: SizeResult,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let mut timeline_ids: Vec<String> = Vec::new();
|
||||
let mut timeline_map: HashMap<TimelineId, usize> = HashMap::new();
|
||||
for (index, ti) in inputs.timeline_inputs.iter().enumerate() {
|
||||
timeline_map.insert(ti.timeline_id, index);
|
||||
timeline_ids.push(ti.timeline_id.to_string());
|
||||
}
|
||||
let seg_to_branch: Vec<usize> = inputs
|
||||
.segments
|
||||
.iter()
|
||||
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
|
||||
.collect();
|
||||
|
||||
let svg =
|
||||
tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut response = String::new();
|
||||
|
||||
use std::fmt::Write;
|
||||
write!(response, "<html>\n<body>\n").unwrap();
|
||||
write!(response, "<div>\n{svg}\n</div>").unwrap();
|
||||
writeln!(response, "Project size: {}", sizes.total_size).unwrap();
|
||||
writeln!(response, "<pre>").unwrap();
|
||||
writeln!(
|
||||
response,
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&inputs).unwrap()
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(
|
||||
response,
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&sizes.segments).unwrap()
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(response, "</pre>").unwrap();
|
||||
write!(response, "</body>\n</html>\n").unwrap();
|
||||
|
||||
html_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
|
||||
let response = Response::builder()
|
||||
.status(status)
|
||||
.header(hyper::header::CONTENT_TYPE, "text/html")
|
||||
.body(Body::from(data.as_bytes().to_vec()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
// Helper function to standardize the error messages we produce on bad durations
|
||||
//
|
||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||
@@ -797,6 +873,14 @@ async fn update_tenant_config_handler(
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(eviction_policy) = request_data.eviction_policy {
|
||||
tenant_conf.eviction_policy = Some(
|
||||
serde_json::from_value(eviction_policy)
|
||||
.context("parse field `eviction_policy`")
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
@@ -842,7 +926,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -889,19 +972,22 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn timeline_download_remote_layers_handler_post(
|
||||
@@ -946,6 +1032,17 @@ async fn active_timeline_of_active_tenant(
|
||||
.map_err(ApiError::NotFound)
|
||||
}
|
||||
|
||||
async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
|
||||
// For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
|
||||
// Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
|
||||
let query = req.uri().query();
|
||||
let _ = std::panic::catch_unwind(|| {
|
||||
panic!("unconditional panic for testing panic hook integration; request query: {query:?}")
|
||||
});
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -995,7 +1092,8 @@ pub fn make_router(
|
||||
let handler = $handler;
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let handler = cfg_disabled;
|
||||
handler
|
||||
|
||||
move |r| RequestSpan(handler).handle(r)
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -1003,35 +1101,55 @@ pub fn make_router(
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/status", |r| RequestSpan(status_handler).handle(r))
|
||||
.put(
|
||||
"/v1/failpoints",
|
||||
testing_api!("manage failpoints", failpoints_handler),
|
||||
)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
|
||||
.put("/v1/tenant/config", update_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
|
||||
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
|
||||
.post("/v1/tenant/:tenant_id/load", tenant_load_handler)
|
||||
.post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
)
|
||||
.get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
|
||||
.post("/v1/tenant", |r| {
|
||||
RequestSpan(tenant_create_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
RequestSpan(tenant_status).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
||||
RequestSpan(tenant_size_handler).handle(r)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
RequestSpan(update_tenant_config_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
RequestSpan(get_tenant_config_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
RequestSpan(timeline_list_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
RequestSpan(timeline_create_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/attach", |r| {
|
||||
RequestSpan(tenant_attach_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||
RequestSpan(tenant_detach_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||
RequestSpan(tenant_load_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
RequestSpan(tenant_ignore_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
RequestSpan(timeline_detail_handler).handle(r)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
||||
get_lsn_by_timestamp_handler,
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
|
||||
testing_api!("run timeline GC", timeline_gc_handler),
|
||||
|r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
|
||||
)
|
||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
|
||||
RequestSpan(timeline_gc_handler).handle(r)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
|
||||
testing_api!("run timeline compaction", timeline_compact_handler),
|
||||
@@ -1042,27 +1160,26 @@ pub fn make_router(
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
timeline_download_remote_layers_handler_post,
|
||||
|r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
timeline_download_remote_layers_handler_get,
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_delete_handler,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer",
|
||||
layer_map_info_handler,
|
||||
|r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
RequestSpan(timeline_delete_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
|
||||
RequestSpan(layer_map_info_handler).handle(r)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
layer_download_handler,
|
||||
|r| RequestSpan(layer_download_handler).handle(r),
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
evict_timeline_layer_handler,
|
||||
|r| RequestSpan(evict_timeline_layer_handler).handle(r),
|
||||
)
|
||||
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -20,7 +20,8 @@ use pageserver_api::models::{
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
};
|
||||
use pq_proto::ConnectionError;
|
||||
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::io;
|
||||
@@ -35,8 +36,6 @@ use utils::{
|
||||
auth::{Claims, JwtAuth, Scope},
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError},
|
||||
simple_rcu::RcuReadGuard,
|
||||
};
|
||||
|
||||
@@ -64,11 +63,11 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
let msg = format!("pageserver is shutting down");
|
||||
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None));
|
||||
Err(QueryError::Other(anyhow::anyhow!(msg)))
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg }
|
||||
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
|
||||
};
|
||||
|
||||
match msg {
|
||||
@@ -79,14 +78,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
FeMessage::Sync => continue,
|
||||
FeMessage::Terminate => {
|
||||
let msg = "client terminated connection with Terminate message during COPY";
|
||||
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
break;
|
||||
}
|
||||
m => {
|
||||
let msg = format!("unexpected message {m:?}");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))?;
|
||||
break;
|
||||
}
|
||||
@@ -96,16 +97,17 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection during COPY";
|
||||
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
pgb.flush().await?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||
Err(io_error)?;
|
||||
}
|
||||
Err(other) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, other))?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -212,7 +214,7 @@ async fn page_service_conn_main(
|
||||
// we've been requested to shut down
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||
if is_expected_io_error(&io_error) {
|
||||
info!("Postgres client disconnected ({io_error})");
|
||||
Ok(())
|
||||
@@ -311,7 +313,7 @@ impl PageServerHandler {
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
|
||||
@@ -380,7 +382,7 @@ impl PageServerHandler {
|
||||
})
|
||||
});
|
||||
|
||||
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
||||
pgb.flush().await?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -416,7 +418,7 @@ impl PageServerHandler {
|
||||
|
||||
// Import basebackup provided via CopyData
|
||||
info!("importing basebackup");
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
||||
@@ -468,7 +470,7 @@ impl PageServerHandler {
|
||||
|
||||
// Import wal provided via CopyData
|
||||
info!("importing wal");
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
||||
let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
|
||||
@@ -678,7 +680,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
// Send a tarball of the latest layer on the timeline
|
||||
@@ -695,7 +697,7 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
}
|
||||
|
||||
pgb.write_message(&BeMessage::CopyDone)?;
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
pgb.flush().await?;
|
||||
info!("basebackup complete");
|
||||
|
||||
@@ -721,7 +723,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl postgres_backend_async::Handler for PageServerHandler {
|
||||
impl postgres_backend::Handler for PageServerHandler {
|
||||
fn check_auth_jwt(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
@@ -812,7 +814,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
|
||||
.await?;
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
@@ -835,15 +837,15 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
|
||||
pgb.write_message(&BeMessage::RowDescription(&[
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::text_col(b"prev_lsn"),
|
||||
RowDescriptor::text_col(b"last_lsn"),
|
||||
]))?
|
||||
.write_message(&BeMessage::DataRow(&[
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(end_of_timeline.prev.to_string().as_bytes()),
|
||||
Some(end_of_timeline.last.to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if query_string.starts_with("fullbackup ") {
|
||||
@@ -884,7 +886,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
|
||||
.await?;
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("import basebackup ") {
|
||||
// Import the `base` section (everything but the wal) of a basebackup.
|
||||
// Assumes the tenant already exists on this pageserver.
|
||||
@@ -929,10 +931,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
@@ -965,10 +967,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
@@ -977,7 +979,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("show ") {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
@@ -993,7 +995,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
pgb.write_message(&BeMessage::RowDescription(&[
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
RowDescriptor::int8_col(b"compaction_target_size"),
|
||||
@@ -1004,7 +1006,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
RowDescriptor::int8_col(b"image_creation_threshold"),
|
||||
RowDescriptor::int8_col(b"pitr_interval"),
|
||||
]))?
|
||||
.write_message(&BeMessage::DataRow(&[
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
|
||||
Some(
|
||||
tenant
|
||||
@@ -1027,7 +1029,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
|
||||
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unknown command {query_string}"
|
||||
@@ -1055,7 +1057,7 @@ impl From<GetActiveTenantError> for QueryError {
|
||||
fn from(e: GetActiveTenantError) -> Self {
|
||||
match e {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
||||
ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
||||
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
||||
),
|
||||
GetActiveTenantError::Other(e) => QueryError::Other(e),
|
||||
}
|
||||
|
||||
@@ -7,11 +7,11 @@ use std::fmt;
|
||||
use std::ops::{AddAssign, Range};
|
||||
use std::time::Duration;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
/// Key used in the Repository kv-store.
|
||||
///
|
||||
/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
|
||||
/// for what we actually store in these fields.
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
pub struct Key {
|
||||
pub field1: u8,
|
||||
pub field2: u32,
|
||||
|
||||
@@ -231,6 +231,9 @@ pub enum TaskKind {
|
||||
// Compaction. One per tenant.
|
||||
Compaction,
|
||||
|
||||
// Eviction. One per timeline.
|
||||
Eviction,
|
||||
|
||||
// Initial logical size calculation
|
||||
InitialLogicalSizeCalculation,
|
||||
|
||||
|
||||
@@ -2418,6 +2418,9 @@ impl Tenant {
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn gather_size_inputs(
|
||||
&self,
|
||||
// `max_retention_period` overrides the cutoff that is used to calculate the size
|
||||
// (only if it is shorter than the real cutoff).
|
||||
max_retention_period: Option<u64>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<size::ModelInputs> {
|
||||
let logical_sizes_at_once = self
|
||||
@@ -2425,32 +2428,41 @@ impl Tenant {
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.inner();
|
||||
|
||||
// TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
|
||||
// are for testing/experimenting, we tolerate this.
|
||||
// TODO: Having a single mutex block concurrent reads is not great for performance.
|
||||
//
|
||||
// But the only case where we need to run multiple of these at once is when we
|
||||
// request a size for a tenant manually via API, while another background calculation
|
||||
// is in progress (which is not a common case).
|
||||
//
|
||||
// See more for on the issue #2748 condenced out of the initial PR review.
|
||||
let mut shared_cache = self.cached_logical_sizes.lock().await;
|
||||
|
||||
size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
|
||||
size::gather_inputs(
|
||||
self,
|
||||
logical_sizes_at_once,
|
||||
max_retention_period,
|
||||
&mut shared_cache,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Calculate synthetic tenant size
|
||||
/// Calculate synthetic tenant size and cache the result.
|
||||
/// This is periodically called by background worker.
|
||||
/// result is cached in tenant struct
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
|
||||
let inputs = self.gather_size_inputs(ctx).await?;
|
||||
let inputs = self.gather_size_inputs(None, ctx).await?;
|
||||
|
||||
self.calc_and_update_cached_synthetic_size(&inputs)
|
||||
}
|
||||
|
||||
/// Calculate synthetic size , cache it and set metric value
|
||||
pub fn calc_and_update_cached_synthetic_size(
|
||||
&self,
|
||||
inputs: &size::ModelInputs,
|
||||
) -> anyhow::Result<u64> {
|
||||
let size = inputs.calculate()?;
|
||||
|
||||
self.set_cached_synthetic_size(size);
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
/// Cache given synthetic size and update the metric value
|
||||
pub fn set_cached_synthetic_size(&self, size: u64) {
|
||||
self.cached_synthetic_tenant_size
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
@@ -2458,8 +2470,6 @@ impl Tenant {
|
||||
.get_metric_with_label_values(&[&self.tenant_id.to_string()])
|
||||
.unwrap()
|
||||
.set(size);
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
pub fn get_cached_synthetic_size(&self) -> u64 {
|
||||
@@ -2757,6 +2767,7 @@ pub mod harness {
|
||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
||||
eviction_policy: Some(tenant_conf.eviction_policy),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ where
|
||||
{
|
||||
reader: R,
|
||||
/// last accessed page
|
||||
cache: Option<(u32, R::BlockLease)>,
|
||||
cache: (u32, [u8; PAGE_SZ]),
|
||||
}
|
||||
|
||||
impl<R> BlockCursor<R>
|
||||
@@ -84,22 +84,20 @@ where
|
||||
pub fn new(reader: R) -> Self {
|
||||
BlockCursor {
|
||||
reader,
|
||||
cache: None,
|
||||
cache: (u32::MAX, [0u8; PAGE_SZ]),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> {
|
||||
// Fast return if this is the same block as before
|
||||
if let Some((cached_blk, _buf)) = &self.cache {
|
||||
if *cached_blk == blknum {
|
||||
return Ok(self);
|
||||
}
|
||||
if self.cache.0 == blknum {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
// Read the block from the underlying reader, and cache it
|
||||
self.cache = None;
|
||||
let buf = self.reader.read_blk(blknum)?;
|
||||
self.cache = Some((blknum, buf));
|
||||
self.cache.0 = blknum;
|
||||
self.cache.1[..].copy_from_slice(&buf[..]);
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
@@ -112,7 +110,7 @@ where
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &<Self as Deref>::Target {
|
||||
&self.cache.as_ref().unwrap().1
|
||||
&self.cache.1
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -91,6 +91,7 @@ pub struct TenantConf {
|
||||
/// to avoid eager reconnects.
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub trace_read_requests: bool,
|
||||
pub eviction_policy: EvictionPolicy,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -102,6 +103,7 @@ pub struct TenantConfOpt {
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
#[serde(default)]
|
||||
pub checkpoint_timeout: Option<Duration>,
|
||||
|
||||
@@ -153,6 +155,34 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub trace_read_requests: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum EvictionPolicy {
|
||||
NoEviction,
|
||||
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
|
||||
}
|
||||
|
||||
impl EvictionPolicy {
|
||||
pub fn discriminant_str(&self) -> &'static str {
|
||||
match self {
|
||||
EvictionPolicy::NoEviction => "NoEviction",
|
||||
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub period: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub threshold: Duration,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -189,6 +219,7 @@ impl TenantConfOpt {
|
||||
trace_read_requests: self
|
||||
.trace_read_requests
|
||||
.unwrap_or(global_conf.trace_read_requests),
|
||||
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,6 +292,7 @@ impl Default for TenantConf {
|
||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||
trace_read_requests: false,
|
||||
eviction_policy: EvictionPolicy::NoEviction,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -427,7 +427,6 @@ mod tests {
|
||||
let actual = cursor.read_blob(pos)?;
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
drop(cursor);
|
||||
|
||||
// Test a large blob that spans multiple pages
|
||||
let mut large_data = Vec::new();
|
||||
|
||||
@@ -53,7 +53,6 @@ use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
@@ -62,8 +61,6 @@ use utils::lsn::Lsn;
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
pub use historic_layer_coverage::Replacement;
|
||||
|
||||
use self::historic_layer_coverage::LayerKey;
|
||||
|
||||
use super::storage_layer::range_eq;
|
||||
|
||||
///
|
||||
@@ -90,18 +87,11 @@ pub struct LayerMap<L: ?Sized> {
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
/// Index of the historic layers optimized for search
|
||||
historic: BufferedHistoricLayerCoverage<LayerKey>,
|
||||
|
||||
/// All layers accessible by key. Useful for:
|
||||
/// 1. Iterating all layers
|
||||
/// 2. Dereferencing a self.historic search result
|
||||
/// 3. Replacing a layer with a remote/local version without
|
||||
/// rebuilding the self.historic index.
|
||||
mapping: HashMap<LayerKey, Arc<L>>,
|
||||
historic: BufferedHistoricLayerCoverage<Arc<L>>,
|
||||
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
l0_delta_layers: HashMap<LayerKey, Arc<L>>,
|
||||
l0_delta_layers: Vec<Arc<L>>,
|
||||
}
|
||||
|
||||
impl<L: ?Sized> Default for LayerMap<L> {
|
||||
@@ -110,9 +100,8 @@ impl<L: ?Sized> Default for LayerMap<L> {
|
||||
open_layer: None,
|
||||
next_open_layer_at: None,
|
||||
frozen_layers: VecDeque::default(),
|
||||
l0_delta_layers: HashMap::default(),
|
||||
l0_delta_layers: Vec::default(),
|
||||
historic: BufferedHistoricLayerCoverage::default(),
|
||||
mapping: HashMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -150,6 +139,30 @@ where
|
||||
self.layer_map.remove_historic_noflush(layer)
|
||||
}
|
||||
|
||||
/// Replaces existing layer iff it is the `expected`.
|
||||
///
|
||||
/// If the expected layer has been removed it will not be inserted by this function.
|
||||
///
|
||||
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
|
||||
/// be done.
|
||||
///
|
||||
/// TODO replacement can be done without buffering and rebuilding layer map updates.
|
||||
/// One way to do that is to add a layer of indirection for returned values, so
|
||||
/// that we can replace values only by updating a hashmap.
|
||||
pub fn replace_historic(
|
||||
&mut self,
|
||||
expected: &Arc<L>,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(
|
||||
// this is not what happens if an L0 layer was not found a anyhow error but perhaps
|
||||
// that should be changed. this is good enough to show a replacement failure.
|
||||
Replacement::NotFound
|
||||
));
|
||||
|
||||
self.layer_map.replace_historic_noflush(expected, new)
|
||||
}
|
||||
|
||||
// We will flush on drop anyway, but this method makes it
|
||||
// more explicit that there is some work being done.
|
||||
/// Apply all updates
|
||||
@@ -221,38 +234,33 @@ where
|
||||
match (latest_delta, latest_image) {
|
||||
(None, None) => None,
|
||||
(None, Some(image)) => {
|
||||
let image = self.mapping.get(&image).unwrap();
|
||||
let lsn_floor = image.get_lsn_range().start;
|
||||
Some(SearchResult {
|
||||
layer: image.clone(),
|
||||
layer: image,
|
||||
lsn_floor,
|
||||
})
|
||||
}
|
||||
(Some(delta), None) => {
|
||||
let delta = self.mapping.get(&delta).unwrap();
|
||||
let lsn_floor = delta.get_lsn_range().start;
|
||||
Some(SearchResult {
|
||||
layer: delta.clone(),
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
})
|
||||
}
|
||||
(Some(delta), Some(image)) => {
|
||||
let image = self.mapping.get(&image).unwrap();
|
||||
let delta = self.mapping.get(&delta).unwrap();
|
||||
|
||||
let img_lsn = image.get_lsn_range().start;
|
||||
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
|
||||
let image_exact_match = img_lsn + 1 == end_lsn;
|
||||
if image_is_newer || image_exact_match {
|
||||
Some(SearchResult {
|
||||
layer: image.clone(),
|
||||
layer: image,
|
||||
lsn_floor: img_lsn,
|
||||
})
|
||||
} else {
|
||||
let lsn_floor =
|
||||
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
|
||||
Some(SearchResult {
|
||||
layer: delta.clone(),
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
})
|
||||
}
|
||||
@@ -271,12 +279,13 @@ where
|
||||
/// Helper function for BatchedUpdates::insert_historic
|
||||
///
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
|
||||
let key = LayerKey::from(&*layer);
|
||||
self.historic.insert(key.clone(), key.clone());
|
||||
self.mapping.insert(key.clone(), layer.clone());
|
||||
self.historic.insert(
|
||||
historic_layer_coverage::LayerKey::from(&*layer),
|
||||
Arc::clone(&layer),
|
||||
);
|
||||
|
||||
if Self::is_l0(&layer) {
|
||||
self.l0_delta_layers.insert(key, layer.clone());
|
||||
self.l0_delta_layers.push(layer);
|
||||
}
|
||||
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
@@ -288,28 +297,27 @@ where
|
||||
/// Helper function for BatchedUpdates::remove_historic
|
||||
///
|
||||
pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
|
||||
let key = historic_layer_coverage::LayerKey::from(&*layer);
|
||||
self.historic.remove(key.clone());
|
||||
self.mapping.remove(&key.clone());
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(&*layer));
|
||||
|
||||
if Self::is_l0(&layer) {
|
||||
self.l0_delta_layers.remove(&key);
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
self.l0_delta_layers
|
||||
.retain(|other| !Self::compare_arced_layers(other, &layer));
|
||||
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
|
||||
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
|
||||
// vtable) pairs.
|
||||
assert_eq!(
|
||||
self.l0_delta_layers.len(),
|
||||
len_before - 1,
|
||||
"failed to locate removed historic layer from l0_delta_layers"
|
||||
);
|
||||
}
|
||||
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
/// Replaces existing layer iff it is the `expected`.
|
||||
///
|
||||
/// If the expected layer has been removed it will not be inserted by this function.
|
||||
///
|
||||
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
|
||||
/// be done.
|
||||
///
|
||||
/// TODO replacement can be done without buffering and rebuilding layer map updates.
|
||||
/// One way to do that is to add a layer of indirection for returned values, so
|
||||
/// that we can replace values only by updating a hashmap.
|
||||
pub fn replace_historic(
|
||||
pub(self) fn replace_historic_noflush(
|
||||
&mut self,
|
||||
expected: &Arc<L>,
|
||||
new: Arc<L>,
|
||||
@@ -330,23 +338,29 @@ where
|
||||
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
if expected_l0 {
|
||||
match self.mapping.entry(key.clone()) {
|
||||
Entry::Occupied(mut entry) => entry.insert(new.clone()),
|
||||
Entry::Vacant(_) => anyhow::bail!("layer doesn't exist"),
|
||||
};
|
||||
let l0_index = if expected_l0 {
|
||||
// find the index in case replace worked, we need to replace that as well
|
||||
Some(
|
||||
self.l0_delta_layers
|
||||
.iter()
|
||||
.position(|slot| Self::compare_arced_layers(slot, expected))
|
||||
.ok_or_else(|| anyhow::anyhow!("existing l0 delta layer was not found"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
match self.mapping.entry(key.clone()) {
|
||||
Entry::Occupied(mut entry) => entry.insert(new.clone()),
|
||||
Entry::Vacant(_) => anyhow::bail!("layer doesn't exist"),
|
||||
};
|
||||
let replaced = self.historic.replace(&key, new.clone(), |existing| {
|
||||
Self::compare_arced_layers(existing, expected)
|
||||
});
|
||||
|
||||
Ok(Replacement::Replaced {
|
||||
in_buffered: false,
|
||||
})
|
||||
if let Replacement::Replaced { .. } = &replaced {
|
||||
if let Some(index) = l0_index {
|
||||
self.l0_delta_layers[index] = new;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(replaced)
|
||||
}
|
||||
|
||||
/// Helper function for BatchedUpdates::drop.
|
||||
@@ -374,8 +388,8 @@ where
|
||||
let start = key.start.to_i128();
|
||||
let end = key.end.to_i128();
|
||||
|
||||
let layer_covers = |key: Option<&LayerKey>| match key {
|
||||
Some(key) => self.mapping.get(key).unwrap().get_lsn_range().start >= lsn.start,
|
||||
let layer_covers = |layer: Option<Arc<L>>| match layer {
|
||||
Some(layer) => layer.get_lsn_range().start >= lsn.start,
|
||||
None => false,
|
||||
};
|
||||
|
||||
@@ -395,7 +409,7 @@ where
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
|
||||
self.mapping.values().cloned()
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
///
|
||||
@@ -422,13 +436,10 @@ where
|
||||
// Initialize loop variables
|
||||
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
|
||||
let mut current_key = start;
|
||||
let mut current_val = version.image_coverage.query(start)
|
||||
.map(|key| self.mapping.get(&key).unwrap().clone());
|
||||
let mut current_val = version.image_coverage.query(start);
|
||||
|
||||
// Loop through the change events and push intervals
|
||||
for (change_key, change_val) in version.image_coverage.range(start..end) {
|
||||
let change_val = change_val.map(|key| self.mapping.get(&key).unwrap().clone());
|
||||
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
coverage.push((kr, current_val.take()));
|
||||
current_key = change_key;
|
||||
@@ -522,7 +533,6 @@ where
|
||||
for (change_key, change_val) in version.delta_coverage.range(start..end) {
|
||||
// If there's a relevant delta in this part, add 1 and recurse down
|
||||
if let Some(val) = current_val {
|
||||
let val = self.mapping.get(&val).unwrap().clone();
|
||||
if val.get_lsn_range().end > lsn.start {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
@@ -545,7 +555,6 @@ where
|
||||
|
||||
// Consider the last part
|
||||
if let Some(val) = current_val {
|
||||
let val = self.mapping.get(&val).unwrap().clone();
|
||||
if val.get_lsn_range().end > lsn.start {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
@@ -702,7 +711,7 @@ where
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
|
||||
Ok(self.l0_delta_layers.values().cloned().collect())
|
||||
Ok(self.l0_delta_layers.clone())
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
@@ -727,6 +736,32 @@ where
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
|
||||
///
|
||||
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
|
||||
#[inline(always)]
|
||||
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
|
||||
// "dyn Trait" objects are "fat pointers" in that they have two components:
|
||||
// - pointer to the object
|
||||
// - pointer to the vtable
|
||||
//
|
||||
// rust does not provide a guarantee that these vtables are unique, but however
|
||||
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
|
||||
// pointer and the vtable need to be equal.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/issues/103763
|
||||
//
|
||||
// A future version of rust will most likely use this form below, where we cast each
|
||||
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
|
||||
// not affect the comparison.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/pull/106450
|
||||
let left = Arc::as_ptr(left) as *const ();
|
||||
let right = Arc::as_ptr(right) as *const ();
|
||||
|
||||
left == right
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -787,6 +822,7 @@ mod tests {
|
||||
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
|
||||
|
||||
let replaced = map
|
||||
.batch_update()
|
||||
.replace_historic(&remote, downloaded.clone())
|
||||
.expect("name derived attributes are the same");
|
||||
assert!(
|
||||
|
||||
@@ -12,7 +12,7 @@ use super::layer_coverage::LayerCoverageTuple;
|
||||
/// These three values are enough to uniquely identify a layer, since
|
||||
/// a layer is obligated to contain all contents within range, so two
|
||||
/// deltas (or images) with the same range have identical content.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct LayerKey {
|
||||
// TODO I use i128 and u64 because it was easy for prototyping,
|
||||
// testing, and benchmarking. If we can use the Lsn and Key
|
||||
@@ -438,6 +438,46 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
///
|
||||
/// Returns a `Replacement` value describing the outcome; only the case of
|
||||
/// `Replacement::Replaced` modifies the map and requires a rebuild.
|
||||
pub fn replace<F>(
|
||||
&mut self,
|
||||
layer_key: &LayerKey,
|
||||
new: Value,
|
||||
check_expected: F,
|
||||
) -> Replacement<Value>
|
||||
where
|
||||
F: FnOnce(&Value) -> bool,
|
||||
{
|
||||
let (slot, in_buffered) = match self.buffer.get(layer_key) {
|
||||
Some(inner @ Some(_)) => {
|
||||
// we compare against the buffered version, because there will be a later
|
||||
// rebuild before querying
|
||||
(inner.as_ref(), true)
|
||||
}
|
||||
Some(None) => {
|
||||
// buffer has removal for this key; it will not be equivalent by any check_expected.
|
||||
return Replacement::RemovalBuffered;
|
||||
}
|
||||
None => {
|
||||
// no pending modification for the key, check layers
|
||||
(self.layers.get(layer_key), false)
|
||||
}
|
||||
};
|
||||
|
||||
match slot {
|
||||
Some(existing) if !check_expected(existing) => {
|
||||
// unfortunate clone here, but otherwise the nll borrowck grows the region of
|
||||
// 'a to cover the whole function, and we could not mutate in the other
|
||||
// Some(existing) branch
|
||||
Replacement::Unexpected(existing.clone())
|
||||
}
|
||||
None => Replacement::NotFound,
|
||||
Some(_existing) => {
|
||||
self.insert(layer_key.to_owned(), new);
|
||||
Replacement::Replaced { in_buffered }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn rebuild(&mut self) {
|
||||
// Find the first LSN that needs to be rebuilt
|
||||
let rebuild_since: u64 = match self.buffer.iter().next() {
|
||||
@@ -481,6 +521,17 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
)
|
||||
}
|
||||
|
||||
/// Iterate all the layers
|
||||
pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
|
||||
// NOTE we can actually perform this without rebuilding,
|
||||
// but it's not necessary for now.
|
||||
if !self.buffer.is_empty() {
|
||||
panic!("rebuild pls")
|
||||
}
|
||||
|
||||
self.layers.values().cloned()
|
||||
}
|
||||
|
||||
/// Return a reference to a queryable map, assuming all updates
|
||||
/// have already been processed using self.rebuild()
|
||||
pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage<Value>> {
|
||||
@@ -619,3 +670,139 @@ fn test_retroactive_simple() {
|
||||
assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_retroactive_replacement() {
|
||||
let mut map = BufferedHistoricLayerCoverage::new();
|
||||
|
||||
let keys = [
|
||||
LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 100..101,
|
||||
is_image: true,
|
||||
},
|
||||
LayerKey {
|
||||
key: 3..9,
|
||||
lsn: 110..111,
|
||||
is_image: true,
|
||||
},
|
||||
LayerKey {
|
||||
key: 4..6,
|
||||
lsn: 120..121,
|
||||
is_image: true,
|
||||
},
|
||||
];
|
||||
|
||||
let layers = [
|
||||
"Image 1".to_string(),
|
||||
"Image 2".to_string(),
|
||||
"Image 3".to_string(),
|
||||
];
|
||||
|
||||
for (key, layer) in keys.iter().zip(layers.iter()) {
|
||||
map.insert(key.to_owned(), layer.to_owned());
|
||||
}
|
||||
|
||||
// rebuild is not necessary here, because replace works for both buffered updates and existing
|
||||
// layers.
|
||||
|
||||
for (key, orig_layer) in keys.iter().zip(layers.iter()) {
|
||||
let replacement = format!("Remote {orig_layer}");
|
||||
|
||||
// evict
|
||||
let ret = map.replace(key, replacement.clone(), |l| l == orig_layer);
|
||||
assert!(
|
||||
matches!(ret, Replacement::Replaced { .. }),
|
||||
"replace {orig_layer}: {ret:?}"
|
||||
);
|
||||
map.rebuild();
|
||||
|
||||
let at = key.lsn.end + 1;
|
||||
|
||||
let version = map.get().expect("rebuilt").get_version(at).unwrap();
|
||||
assert_eq!(
|
||||
version.image_coverage.query(4).as_deref(),
|
||||
Some(replacement.as_str()),
|
||||
"query for 4 at version {at} after eviction",
|
||||
);
|
||||
|
||||
// download
|
||||
let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement);
|
||||
assert!(
|
||||
matches!(ret, Replacement::Replaced { .. }),
|
||||
"replace {orig_layer} back: {ret:?}"
|
||||
);
|
||||
map.rebuild();
|
||||
let version = map.get().expect("rebuilt").get_version(at).unwrap();
|
||||
assert_eq!(
|
||||
version.image_coverage.query(4).as_deref(),
|
||||
Some(orig_layer.as_str()),
|
||||
"query for 4 at version {at} after download",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn missing_key_is_not_inserted_with_replace() {
|
||||
let mut map = BufferedHistoricLayerCoverage::new();
|
||||
let key = LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 100..101,
|
||||
is_image: true,
|
||||
};
|
||||
|
||||
let ret = map.replace(&key, "should not replace", |_| true);
|
||||
assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
|
||||
map.rebuild();
|
||||
assert!(map
|
||||
.get()
|
||||
.expect("no changes to rebuild")
|
||||
.get_version(102)
|
||||
.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replacing_buffered_insert_and_remove() {
|
||||
let mut map = BufferedHistoricLayerCoverage::new();
|
||||
let key = LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 100..101,
|
||||
is_image: true,
|
||||
};
|
||||
|
||||
map.insert(key.clone(), "Image 1");
|
||||
let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1");
|
||||
assert!(
|
||||
matches!(ret, Replacement::Replaced { in_buffered: true }),
|
||||
"{ret:?}"
|
||||
);
|
||||
map.rebuild();
|
||||
|
||||
assert_eq!(
|
||||
map.get()
|
||||
.expect("rebuilt")
|
||||
.get_version(102)
|
||||
.unwrap()
|
||||
.image_coverage
|
||||
.query(4),
|
||||
Some("Remote Image 1")
|
||||
);
|
||||
|
||||
map.remove(key.clone());
|
||||
let ret = map.replace(&key, "should not replace", |_| true);
|
||||
assert!(
|
||||
matches!(ret, Replacement::RemovalBuffered),
|
||||
"cannot replace after scheduled remove: {ret:?}"
|
||||
);
|
||||
|
||||
map.rebuild();
|
||||
|
||||
let ret = map.replace(&key, "should not replace", |_| true);
|
||||
assert!(
|
||||
matches!(ret, Replacement::NotFound),
|
||||
"cannot replace after remove + rebuild: {ret:?}"
|
||||
);
|
||||
|
||||
let at_version = map.get().expect("rebuilt").get_version(102);
|
||||
assert!(at_version.is_none());
|
||||
}
|
||||
|
||||
@@ -101,24 +101,24 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
/// Get the latest (by lsn.end) layer at a given key
|
||||
///
|
||||
/// Complexity: O(log N)
|
||||
pub fn query(&self, key: i128) -> Option<&Value> {
|
||||
pub fn query(&self, key: i128) -> Option<Value> {
|
||||
self.nodes
|
||||
.range(..=key)
|
||||
.rev()
|
||||
.next()?
|
||||
.1
|
||||
.as_ref()
|
||||
.map(|(_, v)| v)
|
||||
.map(|(_, v)| v.clone())
|
||||
}
|
||||
|
||||
/// Iterate the changes in layer coverage in a given range. You will likely
|
||||
/// want to start with self.query(key.start), and then follow up with self.range
|
||||
///
|
||||
/// Complexity: O(log N + result_size)
|
||||
pub fn range(&self, key: Range<i128>) -> impl '_ + Iterator<Item = (i128, Option<&Value>)> {
|
||||
pub fn range(&self, key: Range<i128>) -> impl '_ + Iterator<Item = (i128, Option<Value>)> {
|
||||
self.nodes
|
||||
.range(key)
|
||||
.map(|(k, v)| (*k, v.as_ref().map(|x| &x.1)))
|
||||
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
|
||||
}
|
||||
|
||||
/// O(1) clone
|
||||
|
||||
@@ -540,13 +540,11 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub async fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
@@ -571,14 +571,15 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch a delete operation in the background.
|
||||
///
|
||||
/// The operation does not modify local state but assumes the local files have already been
|
||||
/// deleted, and is used to mirror those changes to remote.
|
||||
///
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// succesfully.
|
||||
///
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
|
||||
@@ -6,11 +6,13 @@
|
||||
use std::collections::HashSet;
|
||||
use std::future::Future;
|
||||
use std::path::Path;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
@@ -26,6 +28,8 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
|
||||
///
|
||||
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
|
||||
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
|
||||
@@ -64,22 +68,28 @@ pub async fn download_layer_file<'a>(
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
"create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut download = storage.download(&remote_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
|
||||
.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok((destination_file, bytes_amount))
|
||||
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
).await?;
|
||||
@@ -300,7 +310,7 @@ where
|
||||
}
|
||||
Err(DownloadError::Other(ref err)) => {
|
||||
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
|
||||
error!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -13,6 +13,7 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use either::Either;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
@@ -92,7 +93,23 @@ pub enum ValueReconstructResult {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct LayerAccessStats(Mutex<LayerAccessStatsInner>);
|
||||
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
|
||||
|
||||
/// This struct holds two instances of [`LayerAccessStatsInner`].
|
||||
/// Accesses are recorded to both instances.
|
||||
/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
|
||||
/// The `for_eviction_policy` is never reset.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct LayerAccessStatsLocked {
|
||||
for_scraping_api: LayerAccessStatsInner,
|
||||
for_eviction_policy: LayerAccessStatsInner,
|
||||
}
|
||||
|
||||
impl LayerAccessStatsLocked {
|
||||
fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
|
||||
[&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct LayerAccessStatsInner {
|
||||
@@ -103,11 +120,11 @@ struct LayerAccessStatsInner {
|
||||
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct LayerAccessStatFullDetails {
|
||||
when: SystemTime,
|
||||
task_kind: TaskKind,
|
||||
access_kind: LayerAccessKind,
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(super) struct LayerAccessStatFullDetails {
|
||||
pub(super) when: SystemTime,
|
||||
pub(super) task_kind: TaskKind,
|
||||
pub(super) access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, strum_macros::EnumString)]
|
||||
@@ -126,7 +143,7 @@ fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
|
||||
}
|
||||
|
||||
impl LayerAccessStatFullDetails {
|
||||
fn to_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
||||
fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
||||
let Self {
|
||||
when,
|
||||
task_kind,
|
||||
@@ -142,13 +159,13 @@ impl LayerAccessStatFullDetails {
|
||||
|
||||
impl LayerAccessStats {
|
||||
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
||||
new
|
||||
}
|
||||
|
||||
pub(crate) fn for_new_layer_file() -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
@@ -176,38 +193,43 @@ impl LayerAccessStats {
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner
|
||||
.last_residence_changes
|
||||
.write(LayerResidenceEvent::new(status, reason));
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner
|
||||
.last_residence_changes
|
||||
.write(LayerResidenceEvent::new(status, reason))
|
||||
});
|
||||
}
|
||||
|
||||
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
let this_access = LayerAccessStatFullDetails {
|
||||
when: SystemTime::now(),
|
||||
task_kind,
|
||||
access_kind,
|
||||
};
|
||||
inner
|
||||
.first_access
|
||||
.get_or_insert_with(|| this_access.clone());
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= task_kind;
|
||||
inner.last_accesses.write(this_access);
|
||||
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner.first_access.get_or_insert(this_access);
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= task_kind;
|
||||
inner.last_accesses.write(this_access);
|
||||
})
|
||||
}
|
||||
fn to_api_model(
|
||||
|
||||
fn as_api_model(
|
||||
&self,
|
||||
reset: LayerAccessStatsReset,
|
||||
) -> pageserver_api::models::LayerAccessStats {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
let inner = &mut locked.for_scraping_api;
|
||||
let LayerAccessStatsInner {
|
||||
first_access,
|
||||
count_by_access_kind,
|
||||
task_kind_flag,
|
||||
last_accesses,
|
||||
last_residence_changes,
|
||||
} = &*inner;
|
||||
} = inner;
|
||||
let ret = pageserver_api::models::LayerAccessStats {
|
||||
access_count_by_access_kind: count_by_access_kind
|
||||
.iter()
|
||||
@@ -217,8 +239,8 @@ impl LayerAccessStats {
|
||||
.iter()
|
||||
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
||||
.collect(),
|
||||
first: first_access.as_ref().map(|a| a.to_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.to_api_model()),
|
||||
first: first_access.as_ref().map(|a| a.as_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
||||
residence_events_history: last_residence_changes.clone(),
|
||||
};
|
||||
match reset {
|
||||
@@ -232,6 +254,20 @@ impl LayerAccessStats {
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
pub(super) fn most_recent_access_or_residence_event(
|
||||
&self,
|
||||
) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
match inner.last_accesses.recent() {
|
||||
Some(a) => Either::Left(*a),
|
||||
None => match inner.last_residence_changes.recent() {
|
||||
Some(e) => Either::Right(e.clone()),
|
||||
None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change event"),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
@@ -328,7 +364,7 @@ pub trait PersistentLayer: Layer {
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete(&self) -> Result<()>;
|
||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
||||
|
||||
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
None
|
||||
@@ -449,3 +485,14 @@ enum PathOrConf {
|
||||
Path(PathBuf),
|
||||
Conf(&'static PageServerConf),
|
||||
}
|
||||
|
||||
/// Range wrapping newtype, which uses display to render Debug.
|
||||
///
|
||||
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
|
||||
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
|
||||
|
||||
impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}..{}", self.0.start, self.0.end)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,8 +194,10 @@ pub struct DeltaLayer {
|
||||
|
||||
impl std::fmt::Debug for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("DeltaLayer")
|
||||
.field("key_range", &self.key_range)
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("lsn_range", &self.lsn_range)
|
||||
.field("file_size", &self.file_size)
|
||||
.field("inner", &self.inner)
|
||||
@@ -436,7 +438,7 @@ impl PersistentLayer for DeltaLayer {
|
||||
))
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
@@ -450,7 +452,7 @@ impl PersistentLayer for DeltaLayer {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
|
||||
let access_stats = self.access_stats.to_api_model(reset);
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
|
||||
@@ -10,12 +10,23 @@ use std::str::FromStr;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// Note: Timeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
#[derive(PartialEq, Eq, Clone, Hash)]
|
||||
pub struct DeltaFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("DeltaFileName")
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("lsn_range", &self.lsn_range)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for DeltaFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
@@ -100,12 +111,23 @@ impl fmt::Display for DeltaFileName {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
#[derive(PartialEq, Eq, Clone, Hash)]
|
||||
pub struct ImageFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("ImageFileName")
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("lsn", &self.lsn)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ImageFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
|
||||
@@ -119,8 +119,10 @@ pub struct ImageLayer {
|
||||
|
||||
impl std::fmt::Debug for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("ImageLayer")
|
||||
.field("key_range", &self.key_range)
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("file_size", &self.file_size)
|
||||
.field("lsn", &self.lsn)
|
||||
.field("inner", &self.inner)
|
||||
@@ -250,7 +252,7 @@ impl PersistentLayer for ImageLayer {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
@@ -269,7 +271,7 @@ impl PersistentLayer for ImageLayer {
|
||||
layer_file_size: Some(self.file_size),
|
||||
lsn_start: lsn_range.start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.to_api_model(reset),
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,17 @@ pub struct RemoteLayer {
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// Has `LayerMap::replace` failed for this (true) or not (false).
|
||||
///
|
||||
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
|
||||
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
|
||||
/// unprocessable, because a LayerMap::replace failed.
|
||||
///
|
||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||
/// `Timeline::download_remote_layer`, which also logs.
|
||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for RemoteLayer {
|
||||
@@ -144,8 +155,8 @@ impl PersistentLayer for RemoteLayer {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
Ok(())
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
bail!("remote layer has no layer file");
|
||||
}
|
||||
|
||||
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
@@ -171,7 +182,7 @@ impl PersistentLayer for RemoteLayer {
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: true,
|
||||
access_stats: self.access_stats.to_api_model(reset),
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
} else {
|
||||
HistoricLayerInfo::Image {
|
||||
@@ -179,7 +190,7 @@ impl PersistentLayer for RemoteLayer {
|
||||
layer_file_size: self.layer_metadata.file_size(),
|
||||
lsn_start: lsn_range.start,
|
||||
remote: true,
|
||||
access_stats: self.access_stats.to_api_model(reset),
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -207,6 +218,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
@@ -228,6 +240,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
@@ -11,6 +11,7 @@ use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::id::TenantId;
|
||||
|
||||
@@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(tenant) => tenant,
|
||||
},
|
||||
};
|
||||
};
|
||||
|
||||
let mut sleep_duration = tenant.get_compaction_period();
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
let period = tenant.get_compaction_period();
|
||||
|
||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
||||
// loop, #3501. There are also additional "allowed_errors" in tests.
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
let sleep_duration = if period == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
@@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
// cutoff specified as time.
|
||||
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
@@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
},
|
||||
};
|
||||
|
||||
let gc_period = tenant.get_gc_period();
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let mut sleep_duration = gc_period;
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run gc
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
let period = tenant.get_gc_period();
|
||||
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run gc
|
||||
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
|
||||
if let Err(e) = res {
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "gc");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
@@ -197,3 +229,49 @@ async fn wait_for_active_tenant(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("cancelled")]
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
/// Provide a random delay for background task initialization.
|
||||
///
|
||||
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
|
||||
/// different periods for more stable load.
|
||||
pub(crate) async fn random_init_delay(
|
||||
period: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), Cancelled> {
|
||||
use rand::Rng;
|
||||
|
||||
let d = {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// gen_range asserts that the range cannot be empty, which it could be because period can
|
||||
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
|
||||
let period = std::cmp::max(period, Duration::from_secs(10));
|
||||
|
||||
// semi-ok default as the source of jitter
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
_ = tokio::time::sleep(d) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
|
||||
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
|
||||
if elapsed >= period && period != Duration::ZERO {
|
||||
// humantime does no significant digits clamping whereas Duration's debug is a bit more
|
||||
// intelligent. however it makes sense to keep the "configuration format" for period, even
|
||||
// though there's no way to output the actual config value.
|
||||
warn!(
|
||||
?elapsed,
|
||||
period = %humantime::format_duration(period),
|
||||
task,
|
||||
"task iteration took longer than the configured period"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user