mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-17 15:20:37 +00:00
Compare commits
2 Commits
arthur/sim
...
layer_map_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c471c25744 | ||
|
|
e030830397 |
@@ -14,6 +14,3 @@ opt-level = 1
|
|||||||
|
|
||||||
[alias]
|
[alias]
|
||||||
build_testing = ["build", "--features", "testing"]
|
build_testing = ["build", "--features", "testing"]
|
||||||
|
|
||||||
[build]
|
|
||||||
rustflags = ["-C", "default-linker-libraries"]
|
|
||||||
|
|||||||
@@ -21,4 +21,3 @@
|
|||||||
!workspace_hack/
|
!workspace_hack/
|
||||||
!neon_local/
|
!neon_local/
|
||||||
!scripts/ninstall.sh
|
!scripts/ninstall.sh
|
||||||
!vm-cgconfig.conf
|
|
||||||
|
|||||||
@@ -2,11 +2,11 @@ storage:
|
|||||||
vars:
|
vars:
|
||||||
bucket_name: neon-prod-storage-ap-southeast-1
|
bucket_name: neon-prod-storage-ap-southeast-1
|
||||||
bucket_region: ap-southeast-1
|
bucket_region: ap-southeast-1
|
||||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
console_mgmt_base_url: http://console-release.local
|
||||||
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||||
metric_collection_interval: 10min
|
metric_collection_interval: 10min
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
@@ -32,7 +32,7 @@ storage:
|
|||||||
hosts:
|
hosts:
|
||||||
safekeeper-0.ap-southeast-1.aws.neon.tech:
|
safekeeper-0.ap-southeast-1.aws.neon.tech:
|
||||||
ansible_host: i-0d6f1dc5161eef894
|
ansible_host: i-0d6f1dc5161eef894
|
||||||
|
safekeeper-1.ap-southeast-1.aws.neon.tech:
|
||||||
|
ansible_host: i-0e338adda8eb2d19f
|
||||||
safekeeper-2.ap-southeast-1.aws.neon.tech:
|
safekeeper-2.ap-southeast-1.aws.neon.tech:
|
||||||
ansible_host: i-04fb63634e4679eb9
|
ansible_host: i-04fb63634e4679eb9
|
||||||
safekeeper-3.ap-southeast-1.aws.neon.tech:
|
|
||||||
ansible_host: i-05481f3bc88cfc2d4
|
|
||||||
|
|||||||
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
|||||||
vars:
|
vars:
|
||||||
bucket_name: neon-prod-storage-eu-central-1
|
bucket_name: neon-prod-storage-eu-central-1
|
||||||
bucket_region: eu-central-1
|
bucket_region: eu-central-1
|
||||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
console_mgmt_base_url: http://console-release.local
|
||||||
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||||
metric_collection_interval: 10min
|
metric_collection_interval: 10min
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
|
|||||||
6
.github/ansible/prod.us-east-2.hosts.yaml
vendored
6
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
|||||||
vars:
|
vars:
|
||||||
bucket_name: neon-prod-storage-us-east-2
|
bucket_name: neon-prod-storage-us-east-2
|
||||||
bucket_region: us-east-2
|
bucket_region: us-east-2
|
||||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
console_mgmt_base_url: http://console-release.local
|
||||||
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||||
metric_collection_interval: 10min
|
metric_collection_interval: 10min
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
@@ -27,8 +27,6 @@ storage:
|
|||||||
ansible_host: i-062227ba7f119eb8c
|
ansible_host: i-062227ba7f119eb8c
|
||||||
pageserver-1.us-east-2.aws.neon.tech:
|
pageserver-1.us-east-2.aws.neon.tech:
|
||||||
ansible_host: i-0b3ec0afab5968938
|
ansible_host: i-0b3ec0afab5968938
|
||||||
pageserver-2.us-east-2.aws.neon.tech:
|
|
||||||
ansible_host: i-0d7a1c4325e71421d
|
|
||||||
|
|
||||||
safekeepers:
|
safekeepers:
|
||||||
hosts:
|
hosts:
|
||||||
|
|||||||
6
.github/ansible/prod.us-west-2.hosts.yaml
vendored
6
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
|||||||
vars:
|
vars:
|
||||||
bucket_name: neon-prod-storage-us-west-2
|
bucket_name: neon-prod-storage-us-west-2
|
||||||
bucket_region: us-west-2
|
bucket_region: us-west-2
|
||||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
console_mgmt_base_url: http://console-release.local
|
||||||
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||||
metric_collection_interval: 10min
|
metric_collection_interval: 10min
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
@@ -29,8 +29,6 @@ storage:
|
|||||||
ansible_host: i-0c834be1dddba8b3f
|
ansible_host: i-0c834be1dddba8b3f
|
||||||
pageserver-2.us-west-2.aws.neon.tech:
|
pageserver-2.us-west-2.aws.neon.tech:
|
||||||
ansible_host: i-051642d372c0a4f32
|
ansible_host: i-051642d372c0a4f32
|
||||||
pageserver-3.us-west-2.aws.neon.tech:
|
|
||||||
ansible_host: i-00c3844beb9ad1c6b
|
|
||||||
|
|
||||||
safekeepers:
|
safekeepers:
|
||||||
hosts:
|
hosts:
|
||||||
|
|||||||
40
.github/ansible/production.hosts.yaml
vendored
Normal file
40
.github/ansible/production.hosts.yaml
vendored
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
---
|
||||||
|
storage:
|
||||||
|
vars:
|
||||||
|
console_mgmt_base_url: http://console-release.local
|
||||||
|
bucket_name: zenith-storage-oregon
|
||||||
|
bucket_region: us-west-2
|
||||||
|
broker_endpoint: http://storage-broker.prod.local:50051
|
||||||
|
pageserver_config_stub:
|
||||||
|
pg_distrib_dir: /usr/local
|
||||||
|
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||||
|
metric_collection_interval: 10min
|
||||||
|
remote_storage:
|
||||||
|
bucket_name: "{{ bucket_name }}"
|
||||||
|
bucket_region: "{{ bucket_region }}"
|
||||||
|
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||||
|
safekeeper_s3_prefix: prod-1/wal
|
||||||
|
hostname_suffix: ".local"
|
||||||
|
remote_user: admin
|
||||||
|
sentry_environment: production
|
||||||
|
|
||||||
|
children:
|
||||||
|
pageservers:
|
||||||
|
hosts:
|
||||||
|
zenith-1-ps-2:
|
||||||
|
console_region_id: aws-us-west-2
|
||||||
|
zenith-1-ps-3:
|
||||||
|
console_region_id: aws-us-west-2
|
||||||
|
zenith-1-ps-4:
|
||||||
|
console_region_id: aws-us-west-2
|
||||||
|
zenith-1-ps-5:
|
||||||
|
console_region_id: aws-us-west-2
|
||||||
|
|
||||||
|
safekeepers:
|
||||||
|
hosts:
|
||||||
|
zenith-1-sk-1:
|
||||||
|
console_region_id: aws-us-west-2
|
||||||
|
zenith-1-sk-2:
|
||||||
|
console_region_id: aws-us-west-2
|
||||||
|
zenith-1-sk-4:
|
||||||
|
console_region_id: aws-us-west-2
|
||||||
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -2,17 +2,12 @@ storage:
|
|||||||
vars:
|
vars:
|
||||||
bucket_name: neon-dev-storage-eu-west-1
|
bucket_name: neon-dev-storage-eu-west-1
|
||||||
bucket_region: eu-west-1
|
bucket_region: eu-west-1
|
||||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
console_mgmt_base_url: http://console-staging.local
|
||||||
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
|
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||||
metric_collection_interval: 10min
|
metric_collection_interval: 10min
|
||||||
tenant_config:
|
|
||||||
eviction_policy:
|
|
||||||
kind: "LayerAccessThreshold"
|
|
||||||
period: "20m"
|
|
||||||
threshold: "20m"
|
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
bucket_region: "{{ bucket_region }}"
|
bucket_region: "{{ bucket_region }}"
|
||||||
|
|||||||
13
.github/ansible/staging.us-east-2.hosts.yaml
vendored
13
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -2,17 +2,12 @@ storage:
|
|||||||
vars:
|
vars:
|
||||||
bucket_name: neon-staging-storage-us-east-2
|
bucket_name: neon-staging-storage-us-east-2
|
||||||
bucket_region: us-east-2
|
bucket_region: us-east-2
|
||||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
console_mgmt_base_url: http://console-staging.local
|
||||||
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
|
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
|
||||||
pageserver_config_stub:
|
pageserver_config_stub:
|
||||||
pg_distrib_dir: /usr/local
|
pg_distrib_dir: /usr/local
|
||||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||||
metric_collection_interval: 10min
|
metric_collection_interval: 10min
|
||||||
tenant_config:
|
|
||||||
eviction_policy:
|
|
||||||
kind: "LayerAccessThreshold"
|
|
||||||
period: "20m"
|
|
||||||
threshold: "20m"
|
|
||||||
remote_storage:
|
remote_storage:
|
||||||
bucket_name: "{{ bucket_name }}"
|
bucket_name: "{{ bucket_name }}"
|
||||||
bucket_region: "{{ bucket_region }}"
|
bucket_region: "{{ bucket_region }}"
|
||||||
@@ -36,8 +31,6 @@ storage:
|
|||||||
ansible_host: i-01e31cdf7e970586a
|
ansible_host: i-01e31cdf7e970586a
|
||||||
pageserver-3.us-east-2.aws.neon.build:
|
pageserver-3.us-east-2.aws.neon.build:
|
||||||
ansible_host: i-0602a0291365ef7cc
|
ansible_host: i-0602a0291365ef7cc
|
||||||
pageserver-99.us-east-2.aws.neon.build:
|
|
||||||
ansible_host: i-0c39491109bb88824
|
|
||||||
|
|
||||||
safekeepers:
|
safekeepers:
|
||||||
hosts:
|
hosts:
|
||||||
@@ -47,5 +40,3 @@ storage:
|
|||||||
ansible_host: i-0171efc3604a7b907
|
ansible_host: i-0171efc3604a7b907
|
||||||
safekeeper-2.us-east-2.aws.neon.build:
|
safekeeper-2.us-east-2.aws.neon.build:
|
||||||
ansible_host: i-0de0b03a51676a6ce
|
ansible_host: i-0de0b03a51676a6ce
|
||||||
safekeeper-99.us-east-2.aws.neon.build:
|
|
||||||
ansible_host: i-0d61b6a2ea32028d5
|
|
||||||
|
|||||||
@@ -1,31 +1,16 @@
|
|||||||
# Helm chart values for neon-proxy-scram.
|
# Helm chart values for neon-proxy-scram.
|
||||||
# This is a YAML-formatted file.
|
# This is a YAML-formatted file.
|
||||||
|
|
||||||
deploymentStrategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
rollingUpdate:
|
|
||||||
maxSurge: 100%
|
|
||||||
maxUnavailable: 50%
|
|
||||||
|
|
||||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
|
||||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
|
||||||
# but doesn't receive new ones
|
|
||||||
containerLifecycle:
|
|
||||||
preStop:
|
|
||||||
exec:
|
|
||||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
|
||||||
terminationGracePeriodSeconds: 604800
|
|
||||||
|
|
||||||
image:
|
image:
|
||||||
repository: neondatabase/neon
|
repository: neondatabase/neon
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||||
domain: "*.eu-west-1.aws.neon.build"
|
domain: "*.eu-west-1.aws.neon.build"
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "staging"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "1min"
|
metricCollectionInterval: "1min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ settings:
|
|||||||
uri: "https://console.stage.neon.tech/psql_session/"
|
uri: "https://console.stage.neon.tech/psql_session/"
|
||||||
domain: "pg.neon.build"
|
domain: "pg.neon.build"
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "staging"
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "1min"
|
metricCollectionInterval: "1min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy-link pods
|
# -- Additional labels for neon-proxy-link pods
|
||||||
|
|||||||
@@ -6,11 +6,11 @@ image:
|
|||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||||
domain: "*.cloud.stage.neon.tech"
|
domain: "*.cloud.stage.neon.tech"
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "staging"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "1min"
|
metricCollectionInterval: "1min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
@@ -1,31 +1,16 @@
|
|||||||
# Helm chart values for neon-proxy-scram.
|
# Helm chart values for neon-proxy-scram.
|
||||||
# This is a YAML-formatted file.
|
# This is a YAML-formatted file.
|
||||||
|
|
||||||
deploymentStrategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
rollingUpdate:
|
|
||||||
maxSurge: 100%
|
|
||||||
maxUnavailable: 50%
|
|
||||||
|
|
||||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
|
||||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
|
||||||
# but doesn't receive new ones
|
|
||||||
containerLifecycle:
|
|
||||||
preStop:
|
|
||||||
exec:
|
|
||||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
|
||||||
terminationGracePeriodSeconds: 604800
|
|
||||||
|
|
||||||
image:
|
image:
|
||||||
repository: neondatabase/neon
|
repository: neondatabase/neon
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||||
domain: "*.us-east-2.aws.neon.build"
|
domain: "*.us-east-2.aws.neon.build"
|
||||||
sentryEnvironment: "staging"
|
sentryEnvironment: "staging"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "1min"
|
metricCollectionInterval: "1min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
@@ -1,32 +1,16 @@
|
|||||||
# Helm chart values for neon-proxy-scram.
|
# Helm chart values for neon-proxy-scram.
|
||||||
# This is a YAML-formatted file.
|
# This is a YAML-formatted file.
|
||||||
|
|
||||||
deploymentStrategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
rollingUpdate:
|
|
||||||
maxSurge: 100%
|
|
||||||
maxUnavailable: 50%
|
|
||||||
|
|
||||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
|
||||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
|
||||||
# but doesn't receive new ones
|
|
||||||
containerLifecycle:
|
|
||||||
preStop:
|
|
||||||
exec:
|
|
||||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
|
||||||
terminationGracePeriodSeconds: 604800
|
|
||||||
|
|
||||||
|
|
||||||
image:
|
image:
|
||||||
repository: neondatabase/neon
|
repository: neondatabase/neon
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
authEndpoint: "http://console-release.local/management/api/v2"
|
||||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "10min"
|
metricCollectionInterval: "10min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
@@ -1,32 +1,16 @@
|
|||||||
# Helm chart values for neon-proxy-scram.
|
# Helm chart values for neon-proxy-scram.
|
||||||
# This is a YAML-formatted file.
|
# This is a YAML-formatted file.
|
||||||
|
|
||||||
deploymentStrategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
rollingUpdate:
|
|
||||||
maxSurge: 100%
|
|
||||||
maxUnavailable: 50%
|
|
||||||
|
|
||||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
|
||||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
|
||||||
# but doesn't receive new ones
|
|
||||||
containerLifecycle:
|
|
||||||
preStop:
|
|
||||||
exec:
|
|
||||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
|
||||||
terminationGracePeriodSeconds: 604800
|
|
||||||
|
|
||||||
|
|
||||||
image:
|
image:
|
||||||
repository: neondatabase/neon
|
repository: neondatabase/neon
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
authEndpoint: "http://console-release.local/management/api/v2"
|
||||||
domain: "*.eu-central-1.aws.neon.tech"
|
domain: "*.eu-central-1.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "10min"
|
metricCollectionInterval: "10min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
@@ -1,32 +1,16 @@
|
|||||||
# Helm chart values for neon-proxy-scram.
|
# Helm chart values for neon-proxy-scram.
|
||||||
# This is a YAML-formatted file.
|
# This is a YAML-formatted file.
|
||||||
|
|
||||||
deploymentStrategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
rollingUpdate:
|
|
||||||
maxSurge: 100%
|
|
||||||
maxUnavailable: 50%
|
|
||||||
|
|
||||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
|
||||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
|
||||||
# but doesn't receive new ones
|
|
||||||
containerLifecycle:
|
|
||||||
preStop:
|
|
||||||
exec:
|
|
||||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
|
||||||
terminationGracePeriodSeconds: 604800
|
|
||||||
|
|
||||||
|
|
||||||
image:
|
image:
|
||||||
repository: neondatabase/neon
|
repository: neondatabase/neon
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
authEndpoint: "http://console-release.local/management/api/v2"
|
||||||
domain: "*.us-east-2.aws.neon.tech"
|
domain: "*.us-east-2.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "10min"
|
metricCollectionInterval: "10min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
@@ -6,11 +6,11 @@ image:
|
|||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
authEndpoint: "http://console-release.local/management/api/v2"
|
||||||
domain: "*.cloud.neon.tech"
|
domain: "*.cloud.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "10min"
|
metricCollectionInterval: "10min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
@@ -1,32 +1,16 @@
|
|||||||
# Helm chart values for neon-proxy-scram.
|
# Helm chart values for neon-proxy-scram.
|
||||||
# This is a YAML-formatted file.
|
# This is a YAML-formatted file.
|
||||||
|
|
||||||
deploymentStrategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
rollingUpdate:
|
|
||||||
maxSurge: 100%
|
|
||||||
maxUnavailable: 50%
|
|
||||||
|
|
||||||
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
|
|
||||||
# The pod(s) will stay in Terminating, keeps the existing connections
|
|
||||||
# but doesn't receive new ones
|
|
||||||
containerLifecycle:
|
|
||||||
preStop:
|
|
||||||
exec:
|
|
||||||
command: ["/bin/sh", "-c", "sleep 604800"]
|
|
||||||
terminationGracePeriodSeconds: 604800
|
|
||||||
|
|
||||||
|
|
||||||
image:
|
image:
|
||||||
repository: neondatabase/neon
|
repository: neondatabase/neon
|
||||||
|
|
||||||
settings:
|
settings:
|
||||||
authBackend: "console"
|
authBackend: "console"
|
||||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
authEndpoint: "http://console-release.local/management/api/v2"
|
||||||
domain: "*.us-west-2.aws.neon.tech"
|
domain: "*.us-west-2.aws.neon.tech"
|
||||||
sentryEnvironment: "production"
|
sentryEnvironment: "production"
|
||||||
wssPort: 8443
|
wssPort: 8443
|
||||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||||
metricCollectionInterval: "10min"
|
metricCollectionInterval: "10min"
|
||||||
|
|
||||||
# -- Additional labels for neon-proxy pods
|
# -- Additional labels for neon-proxy pods
|
||||||
|
|||||||
56
.github/helm-values/production.neon-storage-broker.yaml
vendored
Normal file
56
.github/helm-values/production.neon-storage-broker.yaml
vendored
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
# Helm chart values for neon-storage-broker
|
||||||
|
podLabels:
|
||||||
|
neon_env: production
|
||||||
|
neon_service: storage-broker
|
||||||
|
|
||||||
|
# Use L4 LB
|
||||||
|
service:
|
||||||
|
# service.annotations -- Annotations to add to the service
|
||||||
|
annotations:
|
||||||
|
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||||
|
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||||
|
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||||
|
# assign service to this name at external-dns
|
||||||
|
external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
|
||||||
|
# service.type -- Service type
|
||||||
|
type: LoadBalancer
|
||||||
|
# service.port -- broker listen port
|
||||||
|
port: 50051
|
||||||
|
|
||||||
|
ingress:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
serviceMonitor:
|
||||||
|
enabled: true
|
||||||
|
selector:
|
||||||
|
release: kube-prometheus-stack
|
||||||
|
|
||||||
|
extraManifests:
|
||||||
|
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||||
|
kind: VMServiceScrape
|
||||||
|
metadata:
|
||||||
|
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||||
|
labels:
|
||||||
|
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||||
|
app.kubernetes.io/name: neon-storage-broker
|
||||||
|
app.kubernetes.io/instance: neon-storage-broker
|
||||||
|
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||||
|
app.kubernetes.io/managed-by: Helm
|
||||||
|
namespace: "{{ .Release.Namespace }}"
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: "neon-storage-broker"
|
||||||
|
endpoints:
|
||||||
|
- port: broker
|
||||||
|
path: /metrics
|
||||||
|
interval: 10s
|
||||||
|
scrapeTimeout: 10s
|
||||||
|
namespaceSelector:
|
||||||
|
matchNames:
|
||||||
|
- "{{ .Release.Namespace }}"
|
||||||
|
|
||||||
|
settings:
|
||||||
|
sentryEnvironment: "production"
|
||||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -611,31 +611,34 @@ jobs:
|
|||||||
run:
|
run:
|
||||||
shell: sh -eu {0}
|
shell: sh -eu {0}
|
||||||
env:
|
env:
|
||||||
VM_BUILDER_VERSION: v0.4.6
|
VM_INFORMANT_VERSION: 0.1.1
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Downloading latest vm-builder
|
||||||
uses: actions/checkout@v1
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Downloading vm-builder
|
|
||||||
run: |
|
run: |
|
||||||
curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
|
||||||
chmod +x vm-builder
|
chmod +x vm-builder
|
||||||
|
|
||||||
- name: Pulling compute-node image
|
- name: Pulling compute-node image
|
||||||
run: |
|
run: |
|
||||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||||
|
|
||||||
- name: Building VM compute-node rootfs
|
- name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
|
||||||
run: |
|
run: |
|
||||||
docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
|
curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
|
||||||
|
chmod +x vm-informant
|
||||||
|
|
||||||
|
- name: Adding VM informant to compute-node image
|
||||||
|
run: |
|
||||||
|
ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
|
||||||
|
docker cp vm-informant $ID:/bin/vm-informant
|
||||||
|
docker commit $ID temp-vm-compute-node
|
||||||
|
docker rm -f $ID
|
||||||
|
|
||||||
- name: Build vm image
|
- name: Build vm image
|
||||||
run: |
|
run: |
|
||||||
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
|
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
|
||||||
./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||||
|
|
||||||
- name: Pushing vm-compute-node image
|
- name: Pushing vm-compute-node image
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/deploy-dev.yml
vendored
2
.github/workflows/deploy-dev.yml
vendored
@@ -67,7 +67,7 @@ jobs:
|
|||||||
./get_binaries.sh
|
./get_binaries.sh
|
||||||
|
|
||||||
ansible-galaxy collection install sivel.toiletwater
|
ansible-galaxy collection install sivel.toiletwater
|
||||||
ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||||
rm -f neon_install.tar.gz .neon_current_version
|
rm -f neon_install.tar.gz .neon_current_version
|
||||||
|
|
||||||
- name: Cleanup ansible folder
|
- name: Cleanup ansible folder
|
||||||
|
|||||||
81
.github/workflows/deploy-prod.yml
vendored
81
.github/workflows/deploy-prod.yml
vendored
@@ -40,9 +40,7 @@ concurrency:
|
|||||||
jobs:
|
jobs:
|
||||||
deploy-prod-new:
|
deploy-prod-new:
|
||||||
runs-on: prod
|
runs-on: prod
|
||||||
container:
|
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||||
image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
|
||||||
options: --user root --privileged
|
|
||||||
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
@@ -68,7 +66,7 @@ jobs:
|
|||||||
./get_binaries.sh
|
./get_binaries.sh
|
||||||
|
|
||||||
ansible-galaxy collection install sivel.toiletwater
|
ansible-galaxy collection install sivel.toiletwater
|
||||||
ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||||
rm -f neon_install.tar.gz .neon_current_version
|
rm -f neon_install.tar.gz .neon_current_version
|
||||||
|
|
||||||
deploy-proxy-prod-new:
|
deploy-proxy-prod-new:
|
||||||
@@ -165,3 +163,78 @@ jobs:
|
|||||||
- name: Deploy storage-broker
|
- name: Deploy storage-broker
|
||||||
run:
|
run:
|
||||||
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||||
|
|
||||||
|
# Deploy to old account below
|
||||||
|
|
||||||
|
deploy:
|
||||||
|
runs-on: prod
|
||||||
|
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||||
|
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash
|
||||||
|
environment:
|
||||||
|
name: prod-old
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: ${{ inputs.branch }}
|
||||||
|
|
||||||
|
- name: Redeploy
|
||||||
|
run: |
|
||||||
|
export DOCKER_TAG=${{ inputs.dockerTag }}
|
||||||
|
cd "$(pwd)/.github/ansible"
|
||||||
|
|
||||||
|
./get_binaries.sh
|
||||||
|
|
||||||
|
eval $(ssh-agent)
|
||||||
|
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
|
||||||
|
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||||
|
chmod 0600 ssh-key
|
||||||
|
ssh-add ssh-key
|
||||||
|
rm -f ssh-key ssh-key-cert.pub
|
||||||
|
ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
|
||||||
|
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||||
|
rm -f neon_install.tar.gz .neon_current_version
|
||||||
|
|
||||||
|
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
|
||||||
|
- name: Cleanup ansible folder
|
||||||
|
run: rm -rf ~/.ansible
|
||||||
|
|
||||||
|
deploy-storage-broker:
|
||||||
|
name: deploy storage broker on old staging and old prod
|
||||||
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||||
|
if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash
|
||||||
|
environment:
|
||||||
|
name: prod-old
|
||||||
|
env:
|
||||||
|
KUBECONFIG: .kubeconfig
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: ${{ inputs.branch }}
|
||||||
|
|
||||||
|
- name: Store kubeconfig file
|
||||||
|
run: |
|
||||||
|
echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
|
||||||
|
chmod 0600 ${KUBECONFIG}
|
||||||
|
|
||||||
|
- name: Add neon helm chart
|
||||||
|
run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||||
|
|
||||||
|
- name: Deploy storage-broker
|
||||||
|
run:
|
||||||
|
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||||
|
|
||||||
|
- name: Cleanup helm folder
|
||||||
|
run: rm -rf ~/.cache
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -18,5 +18,3 @@ test_output/
|
|||||||
*.o
|
*.o
|
||||||
*.so
|
*.so
|
||||||
*.Po
|
*.Po
|
||||||
|
|
||||||
tmp
|
|
||||||
|
|||||||
175
Cargo.lock
generated
175
Cargo.lock
generated
@@ -679,25 +679,6 @@ version = "0.3.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cbindgen"
|
|
||||||
version = "0.24.5"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4b922faaf31122819ec80c4047cc684c6979a087366c069611e33649bf98e18d"
|
|
||||||
dependencies = [
|
|
||||||
"clap 3.2.23",
|
|
||||||
"heck",
|
|
||||||
"indexmap",
|
|
||||||
"log",
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"syn",
|
|
||||||
"tempfile",
|
|
||||||
"toml",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cc"
|
name = "cc"
|
||||||
version = "1.0.79"
|
version = "1.0.79"
|
||||||
@@ -776,12 +757,9 @@ version = "3.2.23"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
|
checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"atty",
|
|
||||||
"bitflags",
|
"bitflags",
|
||||||
"clap_lex 0.2.4",
|
"clap_lex 0.2.4",
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"strsim",
|
|
||||||
"termcolor",
|
|
||||||
"textwrap",
|
"textwrap",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -876,7 +854,6 @@ dependencies = [
|
|||||||
"opentelemetry",
|
"opentelemetry",
|
||||||
"postgres",
|
"postgres",
|
||||||
"regex",
|
"regex",
|
||||||
"reqwest",
|
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tar",
|
"tar",
|
||||||
@@ -940,7 +917,6 @@ dependencies = [
|
|||||||
"reqwest",
|
"reqwest",
|
||||||
"safekeeper_api",
|
"safekeeper_api",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
|
||||||
"serde_with",
|
"serde_with",
|
||||||
"storage_broker",
|
"storage_broker",
|
||||||
"tar",
|
"tar",
|
||||||
@@ -1036,20 +1012,6 @@ version = "1.1.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
|
checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "crossbeam"
|
|
||||||
version = "0.8.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c"
|
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"crossbeam-channel",
|
|
||||||
"crossbeam-deque",
|
|
||||||
"crossbeam-epoch",
|
|
||||||
"crossbeam-queue",
|
|
||||||
"crossbeam-utils",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-channel"
|
name = "crossbeam-channel"
|
||||||
version = "0.5.6"
|
version = "0.5.6"
|
||||||
@@ -1084,16 +1046,6 @@ dependencies = [
|
|||||||
"scopeguard",
|
"scopeguard",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "crossbeam-queue"
|
|
||||||
version = "0.3.8"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add"
|
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"crossbeam-utils",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-utils"
|
name = "crossbeam-utils"
|
||||||
version = "0.8.14"
|
version = "0.8.14"
|
||||||
@@ -2157,16 +2109,6 @@ version = "0.3.16"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "mime_guess"
|
|
||||||
version = "2.0.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
|
|
||||||
dependencies = [
|
|
||||||
"mime",
|
|
||||||
"unicase",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "minimal-lexical"
|
name = "minimal-lexical"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
@@ -2479,7 +2421,6 @@ dependencies = [
|
|||||||
"crc32c",
|
"crc32c",
|
||||||
"criterion",
|
"criterion",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
"either",
|
|
||||||
"enum-map",
|
"enum-map",
|
||||||
"enumset",
|
"enumset",
|
||||||
"fail",
|
"fail",
|
||||||
@@ -2543,7 +2484,6 @@ dependencies = [
|
|||||||
"enum-map",
|
"enum-map",
|
||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
|
||||||
"serde_with",
|
"serde_with",
|
||||||
"utils",
|
"utils",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
@@ -2941,7 +2881,6 @@ dependencies = [
|
|||||||
"md5",
|
"md5",
|
||||||
"metrics",
|
"metrics",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"opentelemetry",
|
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"pq_proto",
|
"pq_proto",
|
||||||
@@ -2950,8 +2889,6 @@ dependencies = [
|
|||||||
"rcgen",
|
"rcgen",
|
||||||
"regex",
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"reqwest-middleware",
|
|
||||||
"reqwest-tracing",
|
|
||||||
"routerify",
|
"routerify",
|
||||||
"rstest",
|
"rstest",
|
||||||
"rustls",
|
"rustls",
|
||||||
@@ -2961,7 +2898,6 @@ dependencies = [
|
|||||||
"serde_json",
|
"serde_json",
|
||||||
"sha2",
|
"sha2",
|
||||||
"socket2",
|
"socket2",
|
||||||
"sync_wrapper",
|
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tls-listener",
|
"tls-listener",
|
||||||
"tokio",
|
"tokio",
|
||||||
@@ -2969,9 +2905,7 @@ dependencies = [
|
|||||||
"tokio-postgres-rustls",
|
"tokio-postgres-rustls",
|
||||||
"tokio-rustls",
|
"tokio-rustls",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-opentelemetry",
|
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"tracing-utils",
|
|
||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
"uuid",
|
"uuid",
|
||||||
@@ -3101,7 +3035,6 @@ dependencies = [
|
|||||||
"hyper",
|
"hyper",
|
||||||
"metrics",
|
"metrics",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"pin-project-lite",
|
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
@@ -3113,6 +3046,15 @@ dependencies = [
|
|||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "remove_dir_all"
|
||||||
|
version = "0.5.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
|
||||||
|
dependencies = [
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "reqwest"
|
name = "reqwest"
|
||||||
version = "0.11.14"
|
version = "0.11.14"
|
||||||
@@ -3133,7 +3075,6 @@ dependencies = [
|
|||||||
"js-sys",
|
"js-sys",
|
||||||
"log",
|
"log",
|
||||||
"mime",
|
"mime",
|
||||||
"mime_guess",
|
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
@@ -3153,36 +3094,6 @@ dependencies = [
|
|||||||
"winreg",
|
"winreg",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "reqwest-middleware"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"async-trait",
|
|
||||||
"http",
|
|
||||||
"reqwest",
|
|
||||||
"serde",
|
|
||||||
"task-local-extensions",
|
|
||||||
"thiserror",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "reqwest-tracing"
|
|
||||||
version = "0.4.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce"
|
|
||||||
dependencies = [
|
|
||||||
"async-trait",
|
|
||||||
"opentelemetry",
|
|
||||||
"reqwest",
|
|
||||||
"reqwest-middleware",
|
|
||||||
"task-local-extensions",
|
|
||||||
"tracing",
|
|
||||||
"tracing-opentelemetry",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ring"
|
name = "ring"
|
||||||
version = "0.16.20"
|
version = "0.16.20"
|
||||||
@@ -3353,11 +3264,9 @@ dependencies = [
|
|||||||
"async-trait",
|
"async-trait",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"chrono",
|
|
||||||
"clap 4.1.4",
|
"clap 4.1.4",
|
||||||
"const_format",
|
"const_format",
|
||||||
"crc32c",
|
"crc32c",
|
||||||
"crossbeam",
|
|
||||||
"fs2",
|
"fs2",
|
||||||
"git-version",
|
"git-version",
|
||||||
"hex",
|
"hex",
|
||||||
@@ -3371,11 +3280,9 @@ dependencies = [
|
|||||||
"postgres-protocol",
|
"postgres-protocol",
|
||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
"pq_proto",
|
"pq_proto",
|
||||||
"rand",
|
|
||||||
"regex",
|
"regex",
|
||||||
"remote_storage",
|
"remote_storage",
|
||||||
"safekeeper_api",
|
"safekeeper_api",
|
||||||
"scopeguard",
|
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_with",
|
"serde_with",
|
||||||
@@ -3879,26 +3786,18 @@ dependencies = [
|
|||||||
"xattr",
|
"xattr",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "task-local-extensions"
|
|
||||||
version = "0.1.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4"
|
|
||||||
dependencies = [
|
|
||||||
"tokio",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tempfile"
|
name = "tempfile"
|
||||||
version = "3.4.0"
|
version = "3.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
|
checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"fastrand",
|
"fastrand",
|
||||||
|
"libc",
|
||||||
"redox_syscall",
|
"redox_syscall",
|
||||||
"rustix",
|
"remove_dir_all",
|
||||||
"windows-sys 0.42.0",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3906,8 +3805,6 @@ name = "tenant_size_model"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -4457,15 +4354,6 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "unicase"
|
|
||||||
version = "2.6.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
|
|
||||||
dependencies = [
|
|
||||||
"version_check",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-bidi"
|
name = "unicode-bidi"
|
||||||
version = "0.3.10"
|
version = "0.3.10"
|
||||||
@@ -4555,7 +4443,6 @@ dependencies = [
|
|||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"criterion",
|
"criterion",
|
||||||
"futures",
|
|
||||||
"git-version",
|
"git-version",
|
||||||
"heapless",
|
"heapless",
|
||||||
"hex",
|
"hex",
|
||||||
@@ -4585,7 +4472,6 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"url",
|
"url",
|
||||||
"uuid",
|
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -4637,38 +4523,6 @@ dependencies = [
|
|||||||
"winapi-util",
|
"winapi-util",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "walproposer"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"atty",
|
|
||||||
"bindgen",
|
|
||||||
"byteorder",
|
|
||||||
"bytes",
|
|
||||||
"cbindgen",
|
|
||||||
"crc32c",
|
|
||||||
"env_logger",
|
|
||||||
"hex",
|
|
||||||
"hyper",
|
|
||||||
"libc",
|
|
||||||
"log",
|
|
||||||
"memoffset 0.8.0",
|
|
||||||
"once_cell",
|
|
||||||
"postgres",
|
|
||||||
"postgres_ffi",
|
|
||||||
"rand",
|
|
||||||
"regex",
|
|
||||||
"safekeeper",
|
|
||||||
"scopeguard",
|
|
||||||
"serde",
|
|
||||||
"thiserror",
|
|
||||||
"tracing",
|
|
||||||
"tracing-subscriber",
|
|
||||||
"utils",
|
|
||||||
"workspace_hack",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "want"
|
name = "want"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
@@ -4924,6 +4778,7 @@ dependencies = [
|
|||||||
"either",
|
"either",
|
||||||
"fail",
|
"fail",
|
||||||
"futures",
|
"futures",
|
||||||
|
"futures-channel",
|
||||||
"futures-executor",
|
"futures-executor",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"hashbrown 0.12.3",
|
"hashbrown 0.12.3",
|
||||||
|
|||||||
10
Cargo.toml
10
Cargo.toml
@@ -38,7 +38,6 @@ comfy-table = "6.1"
|
|||||||
const_format = "0.2"
|
const_format = "0.2"
|
||||||
crc32c = "0.6"
|
crc32c = "0.6"
|
||||||
crossbeam-utils = "0.8.5"
|
crossbeam-utils = "0.8.5"
|
||||||
either = "1.8"
|
|
||||||
enum-map = "2.4.2"
|
enum-map = "2.4.2"
|
||||||
enumset = "1.0.12"
|
enumset = "1.0.12"
|
||||||
fail = "0.5.0"
|
fail = "0.5.0"
|
||||||
@@ -69,6 +68,7 @@ once_cell = "1.13"
|
|||||||
opentelemetry = "0.18.0"
|
opentelemetry = "0.18.0"
|
||||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||||
opentelemetry-semantic-conventions = "0.10.0"
|
opentelemetry-semantic-conventions = "0.10.0"
|
||||||
|
tracing-opentelemetry = "0.18.0"
|
||||||
parking_lot = "0.12"
|
parking_lot = "0.12"
|
||||||
pin-project-lite = "0.2"
|
pin-project-lite = "0.2"
|
||||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||||
@@ -76,8 +76,6 @@ prost = "0.11"
|
|||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
regex = "1.4"
|
regex = "1.4"
|
||||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
|
||||||
reqwest-middleware = "0.2.0"
|
|
||||||
routerify = "3"
|
routerify = "3"
|
||||||
rpds = "0.12.0"
|
rpds = "0.12.0"
|
||||||
rustls = "0.20"
|
rustls = "0.20"
|
||||||
@@ -94,7 +92,6 @@ socket2 = "0.4.4"
|
|||||||
strum = "0.24"
|
strum = "0.24"
|
||||||
strum_macros = "0.24"
|
strum_macros = "0.24"
|
||||||
svg_fmt = "0.4.1"
|
svg_fmt = "0.4.1"
|
||||||
sync_wrapper = "0.1.2"
|
|
||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||||
@@ -107,7 +104,6 @@ toml = "0.5"
|
|||||||
toml_edit = { version = "0.17", features = ["easy"] }
|
toml_edit = { version = "0.17", features = ["easy"] }
|
||||||
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
|
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-opentelemetry = "0.18.0"
|
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
url = "2.2"
|
url = "2.2"
|
||||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||||
@@ -138,12 +134,10 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
|||||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||||
safekeeper = { path = "./safekeeper/" }
|
|
||||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||||
utils = { version = "0.1", path = "./libs/utils/" }
|
utils = { version = "0.1", path = "./libs/utils/" }
|
||||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
|
||||||
|
|
||||||
## Common library dependency
|
## Common library dependency
|
||||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||||
@@ -152,7 +146,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
|||||||
criterion = "0.4"
|
criterion = "0.4"
|
||||||
rcgen = "0.10"
|
rcgen = "0.10"
|
||||||
rstest = "0.16"
|
rstest = "0.16"
|
||||||
tempfile = "3.4"
|
tempfile = "3.2"
|
||||||
tonic-build = "0.8"
|
tonic-build = "0.8"
|
||||||
|
|
||||||
# This is only needed for proxy's tests.
|
# This is only needed for proxy's tests.
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
ARG PG_VERSION
|
|
||||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
ARG IMAGE=rust
|
ARG IMAGE=rust
|
||||||
ARG TAG=pinned
|
ARG TAG=pinned
|
||||||
@@ -12,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps
|
|||||||
RUN apt update && \
|
RUN apt update && \
|
||||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||||
libicu-dev libxslt1-dev
|
libicu-dev
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
@@ -24,24 +23,18 @@ FROM build-deps AS pg-build
|
|||||||
ARG PG_VERSION
|
ARG PG_VERSION
|
||||||
COPY vendor/postgres-${PG_VERSION} postgres
|
COPY vendor/postgres-${PG_VERSION} postgres
|
||||||
RUN cd postgres && \
|
RUN cd postgres && \
|
||||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
|
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
|
||||||
--with-libxml --with-libxslt && \
|
|
||||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||||
# Install headers
|
# Install headers
|
||||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||||
# Enable some of contrib extensions
|
# Enable some of contrib extensions
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||||
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
@@ -57,18 +50,17 @@ RUN apt update && \
|
|||||||
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
|
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
|
||||||
protobuf-c-compiler xsltproc
|
protobuf-c-compiler xsltproc
|
||||||
|
|
||||||
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
|
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
|
||||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
tar zxvf SFCGAL-v1.3.10.tar.gz && \
|
||||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
|
||||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
make clean && cp -R /sfcgal/* /
|
make clean && cp -R /sfcgal/* /
|
||||||
|
|
||||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||||
|
tar xvzf postgis-3.3.1.tar.gz && \
|
||||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
|
cd postgis-3.3.1 && \
|
||||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
|
||||||
./autogen.sh && \
|
./autogen.sh && \
|
||||||
|
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
cd extensions/postgis && \
|
cd extensions/postgis && \
|
||||||
@@ -82,15 +74,6 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
|
|||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||||
|
|
||||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
|
||||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
|
||||||
mkdir build && \
|
|
||||||
cd build && \
|
|
||||||
cmake .. && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
# Layer "plv8-build"
|
# Layer "plv8-build"
|
||||||
@@ -100,17 +83,30 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
|
|||||||
FROM build-deps AS plv8-build
|
FROM build-deps AS plv8-build
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
RUN apt update && \
|
RUN apt update && \
|
||||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
|
||||||
|
|
||||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
|
# https://github.com/plv8/plv8/issues/475:
|
||||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
# v8 uses gold for linking and sets `--thread-count=4` which breaks
|
||||||
|
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
|
||||||
|
# Install newer gold version manually as debian-testing binutils version updates
|
||||||
|
# libc version, which in turn breaks other extension built against non-testing libc.
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
|
||||||
|
tar xvzf binutils-2.38.tar.gz && \
|
||||||
|
cd binutils-2.38 && \
|
||||||
|
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
|
cd ../bfd && ./configure && make bfdver.h && \
|
||||||
|
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
|
||||||
|
cp /usr/local/bin/ld.gold /usr/bin/gold
|
||||||
|
|
||||||
|
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||||
|
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||||
|
tar xvzf v3.1.4.tar.gz && \
|
||||||
|
cd plv8-3.1.4 && \
|
||||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||||
|
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
rm -rf /plv8-* && \
|
rm -rf /plv8-* && \
|
||||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
@@ -128,17 +124,20 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2
|
|||||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||||
&& rm /tmp/cmake-install.sh
|
&& rm /tmp/cmake-install.sh
|
||||||
|
|
||||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
|
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
|
||||||
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
|
tar xvzf h3.tgz && \
|
||||||
mkdir build && cd build && \
|
cd h3-4.0.1 && \
|
||||||
|
mkdir build && \
|
||||||
|
cd build && \
|
||||||
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
DESTDIR=/h3 make install && \
|
DESTDIR=/h3 make install && \
|
||||||
cp -R /h3/usr / && \
|
cp -R /h3/usr / && \
|
||||||
rm -rf build
|
rm -rf build
|
||||||
|
|
||||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
|
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
|
||||||
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
tar xvzf h3-pg.tgz && \
|
||||||
|
cd h3-pg-4.0.1 && \
|
||||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
@@ -154,8 +153,9 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3
|
|||||||
FROM build-deps AS unit-pg-build
|
FROM build-deps AS unit-pg-build
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
|
||||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
|
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
|
||||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
tar xvzf 7.7.tar.gz && \
|
||||||
|
cd postgresql-unit-7.7 && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||||
@@ -165,156 +165,6 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
|||||||
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
|
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "vector-pg-build"
|
|
||||||
# compile pgvector extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS vector-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
|
|
||||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "pgjwt-pg-build"
|
|
||||||
# compile pgjwt extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS pgjwt-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
|
|
||||||
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
|
|
||||||
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "hypopg-pg-build"
|
|
||||||
# compile hypopg extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS hypopg-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
|
|
||||||
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "pg-hashids-pg-build"
|
|
||||||
# compile pg_hashids extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS pg-hashids-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
|
||||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "rum-pg-build"
|
|
||||||
# compile rum extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS rum-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
|
||||||
mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "pgtap-pg-build"
|
|
||||||
# compile pgTAP extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS pgtap-pg-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
|
|
||||||
mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "rust extensions"
|
|
||||||
# This layer is used to build `pgx` deps
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM build-deps AS rust-extensions-build
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y curl libclang-dev cmake && \
|
|
||||||
useradd -ms /bin/bash nonroot -b /home
|
|
||||||
|
|
||||||
ENV HOME=/home/nonroot
|
|
||||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
|
||||||
USER nonroot
|
|
||||||
WORKDIR /home/nonroot
|
|
||||||
ARG PG_VERSION
|
|
||||||
|
|
||||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
|
||||||
chmod +x rustup-init && \
|
|
||||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
|
||||||
rm rustup-init && \
|
|
||||||
cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \
|
|
||||||
/bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
|
||||||
|
|
||||||
USER root
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "pg-jsonschema-pg-build"
|
|
||||||
# Compile "pg_jsonschema" extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
|
|
||||||
FROM rust-extensions-build AS pg-jsonschema-pg-build
|
|
||||||
|
|
||||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \
|
|
||||||
cd pg_jsonschema && \
|
|
||||||
cargo pgx install --release && \
|
|
||||||
# it's needed to enable extension because it uses untrusted C language
|
|
||||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_jsonschema.control && \
|
|
||||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "pg-graphql-pg-build"
|
|
||||||
# Compile "pg_graphql" extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
|
|
||||||
FROM rust-extensions-build AS pg-graphql-pg-build
|
|
||||||
|
|
||||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \
|
|
||||||
cd pg_graphql && \
|
|
||||||
cargo pgx install --release && \
|
|
||||||
# it's needed to enable extension because it uses untrusted C language
|
|
||||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
|
||||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
# Layer "neon-pg-ext-build"
|
# Layer "neon-pg-ext-build"
|
||||||
@@ -328,14 +178,6 @@ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
|||||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=h3-pg-build /h3/usr /
|
COPY --from=h3-pg-build /h3/usr /
|
||||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY pgxn/ pgxn/
|
COPY pgxn/ pgxn/
|
||||||
|
|
||||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||||
@@ -386,9 +228,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
|||||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||||
chown -R postgres:postgres /var/db/postgres && \
|
chown -R postgres:postgres /var/db/postgres && \
|
||||||
chmod 0750 /var/db/postgres/compute && \
|
chmod 0750 /var/db/postgres/compute && \
|
||||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||||
# create folder for file cache
|
|
||||||
mkdir -p -m 777 /neon/cache
|
|
||||||
|
|
||||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||||
@@ -398,7 +238,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
|||||||
# libicu67, locales for collations (including ICU)
|
# libicu67, locales for collations (including ICU)
|
||||||
# libossp-uuid16 for extension ossp-uuid
|
# libossp-uuid16 for extension ossp-uuid
|
||||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||||
# libxml2, libxslt1.1 for xml2
|
|
||||||
RUN apt update && \
|
RUN apt update && \
|
||||||
apt install --no-install-recommends -y \
|
apt install --no-install-recommends -y \
|
||||||
locales \
|
locales \
|
||||||
@@ -410,8 +249,6 @@ RUN apt update && \
|
|||||||
libproj19 \
|
libproj19 \
|
||||||
libprotobuf-c1 \
|
libprotobuf-c1 \
|
||||||
libsfcgal1 \
|
libsfcgal1 \
|
||||||
libxml2 \
|
|
||||||
libxslt1.1 \
|
|
||||||
gdb && \
|
gdb && \
|
||||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||||
|
|||||||
@@ -1,25 +0,0 @@
|
|||||||
# Note: this file *mostly* just builds on Dockerfile.compute-node
|
|
||||||
|
|
||||||
ARG SRC_IMAGE
|
|
||||||
ARG VM_INFORMANT_VERSION=v0.1.6
|
|
||||||
|
|
||||||
# Pull VM informant and set up inittab
|
|
||||||
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
|
|
||||||
|
|
||||||
RUN set -e \
|
|
||||||
&& rm -f /etc/inittab \
|
|
||||||
&& touch /etc/inittab
|
|
||||||
|
|
||||||
RUN set -e \
|
|
||||||
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart'" >> /etc/inittab
|
|
||||||
|
|
||||||
# Combine, starting from non-VM compute node image.
|
|
||||||
FROM $SRC_IMAGE as base
|
|
||||||
|
|
||||||
# Temporarily set user back to root so we can run adduser
|
|
||||||
USER root
|
|
||||||
RUN adduser vm-informant --disabled-password --no-create-home
|
|
||||||
USER postgres
|
|
||||||
|
|
||||||
COPY --from=informant /etc/inittab /etc/inittab
|
|
||||||
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
|
|
||||||
20
Makefile
20
Makefile
@@ -39,8 +39,6 @@ endif
|
|||||||
# been no changes to the files. Changing the mtime triggers an
|
# been no changes to the files. Changing the mtime triggers an
|
||||||
# unnecessary rebuild of 'postgres_ffi'.
|
# unnecessary rebuild of 'postgres_ffi'.
|
||||||
PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C'
|
PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C'
|
||||||
PG_CONFIGURE_OPTS += CC=clang
|
|
||||||
PG_CONFIGURE_OPTS += CCX=clang++
|
|
||||||
|
|
||||||
# Choose whether we should be silent or verbose
|
# Choose whether we should be silent or verbose
|
||||||
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
|
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
|
||||||
@@ -136,23 +134,11 @@ neon-pg-ext-%: postgres-%
|
|||||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
||||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
|
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
|
||||||
|
|
||||||
.PHONY:
|
|
||||||
neon-pg-ext-walproposer:
|
|
||||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
|
||||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-v15 \
|
|
||||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
|
|
||||||
|
|
||||||
.PHONY: neon-pg-ext-clean-%
|
.PHONY: neon-pg-ext-clean-%
|
||||||
neon-pg-ext-clean-%:
|
neon-pg-ext-clean-%:
|
||||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
|
||||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
|
|
||||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
|
||||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
|
||||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
|
||||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
|
||||||
|
|
||||||
.PHONY: neon-pg-ext
|
.PHONY: neon-pg-ext
|
||||||
neon-pg-ext: \
|
neon-pg-ext: \
|
||||||
|
|||||||
15
README.md
15
README.md
@@ -34,11 +34,6 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
|
|||||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||||
protobuf-devel
|
protobuf-devel
|
||||||
```
|
```
|
||||||
* On Arch based systems, these packages are needed:
|
|
||||||
```bash
|
|
||||||
pacman -S base-devel readline zlib libseccomp openssl clang \
|
|
||||||
postgresql-libs cmake postgresql protobuf
|
|
||||||
```
|
|
||||||
|
|
||||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||||
```
|
```
|
||||||
@@ -88,10 +83,9 @@ cd neon
|
|||||||
|
|
||||||
# The preferred and default is to make a debug build. This will create a
|
# The preferred and default is to make a debug build. This will create a
|
||||||
# demonstrably slower build than a release build. For a release build,
|
# demonstrably slower build than a release build. For a release build,
|
||||||
# use "BUILD_TYPE=release make -j`nproc` -s"
|
# use "BUILD_TYPE=release make -j`nproc`"
|
||||||
# Remove -s for the verbose build log
|
|
||||||
|
|
||||||
make -j`nproc` -s
|
make -j`nproc`
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Building on OSX
|
#### Building on OSX
|
||||||
@@ -105,10 +99,9 @@ cd neon
|
|||||||
|
|
||||||
# The preferred and default is to make a debug build. This will create a
|
# The preferred and default is to make a debug build. This will create a
|
||||||
# demonstrably slower build than a release build. For a release build,
|
# demonstrably slower build than a release build. For a release build,
|
||||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s"
|
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
|
||||||
# Remove -s for the verbose build log
|
|
||||||
|
|
||||||
make -j`sysctl -n hw.logicalcpu` -s
|
make -j`sysctl -n hw.logicalcpu`
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Dependency installation notes
|
#### Dependency installation notes
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ regex.workspace = true
|
|||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
tar.workspace = true
|
tar.workspace = true
|
||||||
reqwest = { workspace = true, features = ["json"] }
|
|
||||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||||
tokio-postgres.workspace = true
|
tokio-postgres.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ use tracing::{error, info};
|
|||||||
|
|
||||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||||
use compute_tools::http::api::launch_http_server;
|
use compute_tools::http::api::launch_http_server;
|
||||||
|
use compute_tools::informant::spawn_vm_informant_if_present;
|
||||||
use compute_tools::logger::*;
|
use compute_tools::logger::*;
|
||||||
use compute_tools::monitor::launch_monitor;
|
use compute_tools::monitor::launch_monitor;
|
||||||
use compute_tools::params::*;
|
use compute_tools::params::*;
|
||||||
@@ -65,9 +66,6 @@ fn main() -> Result<()> {
|
|||||||
let spec = matches.get_one::<String>("spec");
|
let spec = matches.get_one::<String>("spec");
|
||||||
let spec_path = matches.get_one::<String>("spec-path");
|
let spec_path = matches.get_one::<String>("spec-path");
|
||||||
|
|
||||||
let compute_id = matches.get_one::<String>("compute-id");
|
|
||||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
|
||||||
|
|
||||||
// Try to use just 'postgres' if no path is provided
|
// Try to use just 'postgres' if no path is provided
|
||||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||||
|
|
||||||
@@ -80,27 +78,8 @@ fn main() -> Result<()> {
|
|||||||
let path = Path::new(sp);
|
let path = Path::new(sp);
|
||||||
let file = File::open(path)?;
|
let file = File::open(path)?;
|
||||||
serde_json::from_reader(file)?
|
serde_json::from_reader(file)?
|
||||||
} else if let Some(id) = compute_id {
|
|
||||||
if let Some(cp_base) = control_plane_uri {
|
|
||||||
let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec");
|
|
||||||
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
|
|
||||||
Ok(v) => v,
|
|
||||||
Err(_) => "".to_string(),
|
|
||||||
};
|
|
||||||
|
|
||||||
reqwest::blocking::Client::new()
|
|
||||||
.get(cp_uri)
|
|
||||||
.header("Authorization", jwt)
|
|
||||||
.send()?
|
|
||||||
.json()?
|
|
||||||
} else {
|
|
||||||
panic!(
|
|
||||||
"must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"",
|
|
||||||
control_plane_uri, compute_id
|
|
||||||
);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
panic!("compute spec should be provided via --spec or --spec-path argument");
|
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -162,6 +141,8 @@ fn main() -> Result<()> {
|
|||||||
// requests, while configuration is still in progress.
|
// requests, while configuration is still in progress.
|
||||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||||
|
// Also spawn the thread responsible for handling the VM informant -- if it's present
|
||||||
|
let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
|
||||||
|
|
||||||
// Start Postgres
|
// Start Postgres
|
||||||
let mut delay_exit = false;
|
let mut delay_exit = false;
|
||||||
@@ -249,18 +230,6 @@ fn cli() -> clap::Command {
|
|||||||
.long("spec-path")
|
.long("spec-path")
|
||||||
.value_name("SPEC_PATH"),
|
.value_name("SPEC_PATH"),
|
||||||
)
|
)
|
||||||
.arg(
|
|
||||||
Arg::new("compute-id")
|
|
||||||
.short('i')
|
|
||||||
.long("compute-id")
|
|
||||||
.value_name("COMPUTE_ID"),
|
|
||||||
)
|
|
||||||
.arg(
|
|
||||||
Arg::new("control-plane-uri")
|
|
||||||
.short('p')
|
|
||||||
.long("control-plane-uri")
|
|
||||||
.value_name("CONTROL_PLANE"),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use std::net::SocketAddr;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
|
|
||||||
use crate::compute::ComputeNode;
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use hyper::service::{make_service_fn, service_fn};
|
use hyper::service::{make_service_fn, service_fn};
|
||||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||||
@@ -11,6 +10,8 @@ use serde_json;
|
|||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
use tracing_utils::http::OtelName;
|
use tracing_utils::http::OtelName;
|
||||||
|
|
||||||
|
use crate::compute::ComputeNode;
|
||||||
|
|
||||||
// Service function to handle all available routes.
|
// Service function to handle all available routes.
|
||||||
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
||||||
//
|
//
|
||||||
|
|||||||
50
compute_tools/src/informant.rs
Normal file
50
compute_tools/src/informant.rs
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
use std::path::Path;
|
||||||
|
use std::process;
|
||||||
|
use std::thread;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tracing::{info, warn};
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
|
||||||
|
const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
|
||||||
|
const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
|
||||||
|
|
||||||
|
/// Launch a thread to start the VM informant if it's present (and restart, on failure)
|
||||||
|
pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
|
||||||
|
let exists = Path::new(VM_INFORMANT_PATH)
|
||||||
|
.try_exists()
|
||||||
|
.context("could not check if path exists")?;
|
||||||
|
|
||||||
|
if !exists {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Some(
|
||||||
|
thread::Builder::new()
|
||||||
|
.name("run-vm-informant".into())
|
||||||
|
.spawn(move || run_informant())?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_informant() -> ! {
|
||||||
|
let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
|
||||||
|
|
||||||
|
info!("starting VM informant");
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let mut cmd = process::Command::new(VM_INFORMANT_PATH);
|
||||||
|
// Block on subprocess:
|
||||||
|
let result = cmd.status();
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
|
||||||
|
Ok(status) if !status.success() => {
|
||||||
|
warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
|
||||||
|
}
|
||||||
|
Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait before retrying
|
||||||
|
thread::sleep(restart_wait);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -8,6 +8,7 @@ pub mod http;
|
|||||||
#[macro_use]
|
#[macro_use]
|
||||||
pub mod logger;
|
pub mod logger;
|
||||||
pub mod compute;
|
pub mod compute;
|
||||||
|
pub mod informant;
|
||||||
pub mod monitor;
|
pub mod monitor;
|
||||||
pub mod params;
|
pub mod params;
|
||||||
pub mod pg_helpers;
|
pub mod pg_helpers;
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ postgres.workspace = true
|
|||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
|
||||||
serde_with.workspace = true
|
serde_with.workspace = true
|
||||||
tar.workspace = true
|
tar.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
|
|||||||
@@ -419,11 +419,6 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<bool>())
|
.map(|x| x.parse::<bool>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||||
eviction_policy: settings
|
|
||||||
.get("eviction_policy")
|
|
||||||
.map(|x| serde_json::from_str(x))
|
|
||||||
.transpose()
|
|
||||||
.context("Failed to parse 'eviction_policy' json")?,
|
|
||||||
})
|
})
|
||||||
.send()?
|
.send()?
|
||||||
.error_from_body()?;
|
.error_from_body()?;
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898'
|
|||||||
checkpoint_distance = '268435456' # in bytes
|
checkpoint_distance = '268435456' # in bytes
|
||||||
checkpoint_timeout = '10m'
|
checkpoint_timeout = '10m'
|
||||||
|
|
||||||
gc_period = '1 hour'
|
gc_period = '100 s'
|
||||||
gc_horizon = '67108864'
|
gc_horizon = '67108864'
|
||||||
|
|
||||||
max_file_descriptors = '100'
|
max_file_descriptors = '100'
|
||||||
@@ -101,7 +101,7 @@ away.
|
|||||||
|
|
||||||
#### gc_period
|
#### gc_period
|
||||||
|
|
||||||
Interval at which garbage collection is triggered. Default is 1 hour.
|
Interval at which garbage collection is triggered. Default is 100 s.
|
||||||
|
|
||||||
#### image_creation_threshold
|
#### image_creation_threshold
|
||||||
|
|
||||||
@@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3.
|
|||||||
|
|
||||||
#### pitr_interval
|
#### pitr_interval
|
||||||
|
|
||||||
WAL retention duration for PITR branching. Default is 7 days.
|
WAL retention duration for PITR branching. Default is 30 days.
|
||||||
|
|
||||||
#### walreceiver_connect_timeout
|
#### walreceiver_connect_timeout
|
||||||
|
|
||||||
|
|||||||
@@ -1,335 +0,0 @@
|
|||||||
# Synthetic size
|
|
||||||
|
|
||||||
Neon storage has copy-on-write branching, which makes it difficult to
|
|
||||||
answer the question "how large is my database"? To give one reasonable
|
|
||||||
answer, we calculate _synthetic size_ for a project.
|
|
||||||
|
|
||||||
The calculation is called "synthetic", because it is based purely on
|
|
||||||
the user-visible logical size, which is the size that you would see on
|
|
||||||
a standalone PostgreSQL installation, and the amount of WAL, which is
|
|
||||||
also the same as what you'd see on a standalone PostgreSQL, for the
|
|
||||||
same set of updates.
|
|
||||||
|
|
||||||
The synthetic size does *not* depend on the actual physical size
|
|
||||||
consumed in the storage, or implementation details of the Neon storage
|
|
||||||
like garbage collection, compaction and compression. There is a
|
|
||||||
strong *correlation* between the physical size and the synthetic size,
|
|
||||||
but the synthetic size is designed to be independent of the
|
|
||||||
implementation details, so that any improvements we make in the
|
|
||||||
storage system simply reduce our COGS. And vice versa: any bugs or bad
|
|
||||||
implementation where we keep more data than we would need to, do not
|
|
||||||
change the synthetic size or incur any costs to the user.
|
|
||||||
|
|
||||||
The synthetic size is calculated for the whole project. It is not
|
|
||||||
straighforward to attribute size to individual branches. See "What is
|
|
||||||
the size of an individual branch?" for discussion on those
|
|
||||||
difficulties.
|
|
||||||
|
|
||||||
The synthetic size is designed to:
|
|
||||||
|
|
||||||
- Take into account the copy-on-write nature of the storage. For
|
|
||||||
example, if you create a branch, it doesn't immediately add anything
|
|
||||||
to the synthetic size. It starts to affect the synthetic size only
|
|
||||||
as it diverges from the parent branch.
|
|
||||||
|
|
||||||
- Be independent of any implementation details of the storage, like
|
|
||||||
garbage collection, remote storage, or compression.
|
|
||||||
|
|
||||||
## Terms & assumptions
|
|
||||||
|
|
||||||
- logical size is the size of a branch *at a given point in
|
|
||||||
time*. It's the total size of all tables in all databases, as you
|
|
||||||
see with "\l+" in psql for example, plus the Postgres SLRUs and some
|
|
||||||
small amount of metadata. NOTE that currently, Neon does not include
|
|
||||||
the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`.
|
|
||||||
|
|
||||||
- a "point in time" is defined as an LSN value. You can convert a
|
|
||||||
timestamp to an LSN, but the storage internally works with LSNs.
|
|
||||||
|
|
||||||
- PITR horizon can be set per-branch.
|
|
||||||
|
|
||||||
- PITR horizon can be set as a time interval, e.g. 5 days or hours, or
|
|
||||||
as amount of WAL, in bytes. If it's given as a time interval, it's
|
|
||||||
converted to an LSN for the calculation.
|
|
||||||
|
|
||||||
- PITR horizon can be set to 0, if you don't want to retain any history.
|
|
||||||
|
|
||||||
## Calculation
|
|
||||||
|
|
||||||
Inputs to the calculation are:
|
|
||||||
- logical size of the database at different points in time,
|
|
||||||
- amount of WAL generated, and
|
|
||||||
- the PITR horizon settings
|
|
||||||
|
|
||||||
The synthetic size is based on an idealistic model of the storage
|
|
||||||
system, where we pretend that the storage consists of two things:
|
|
||||||
- snapshots, containing a full snapshot of the database, at a given
|
|
||||||
point in time, and
|
|
||||||
- WAL.
|
|
||||||
|
|
||||||
In the simple case that the project contains just one branch (main),
|
|
||||||
and a fixed PITR horizon, the synthetic size is the sum of:
|
|
||||||
|
|
||||||
- the logical size of the branch *at the beginning of the PITR
|
|
||||||
horizon*, i.e. at the oldest point that you can still recover to, and
|
|
||||||
- the size of the WAL covering the PITR horizon.
|
|
||||||
|
|
||||||
The snapshot allows you to recover to the beginning of the PITR
|
|
||||||
horizon, and the WAL allows you to recover from that point to any
|
|
||||||
point within the horizon.
|
|
||||||
|
|
||||||
```
|
|
||||||
WAL
|
|
||||||
-----------------------#########>
|
|
||||||
^
|
|
||||||
snapshot
|
|
||||||
|
|
||||||
Legend:
|
|
||||||
##### PITR horizon. This is the region that you can still access
|
|
||||||
with Point-in-time query and you can still create branches
|
|
||||||
from.
|
|
||||||
----- history that has fallen out of the PITR horizon, and can no
|
|
||||||
longer be accessed
|
|
||||||
```
|
|
||||||
|
|
||||||
NOTE: This is not how the storage system actually works! The actual
|
|
||||||
implementation is also based on snapshots and WAL, but the snapshots
|
|
||||||
are taken for individual database pages and ranges of pages rather
|
|
||||||
than the whole database, and it is much more complicated. This model
|
|
||||||
is a reasonable approximation, however, to make the synthetic size a
|
|
||||||
useful proxy for the actual storage consumption.
|
|
||||||
|
|
||||||
|
|
||||||
## Example: Data is INSERTed
|
|
||||||
|
|
||||||
For example, let's assume that your database contained 10 GB of data
|
|
||||||
at the beginning of the PITR horizon, and you have since then inserted
|
|
||||||
5 GB of additional data into it. The additional insertions of 5 GB of
|
|
||||||
data consume roughly 5 GB of WAL. In that case, the synthetic size is:
|
|
||||||
|
|
||||||
> 10 GB (snapshot) + 5 GB (WAL) = 15 GB
|
|
||||||
|
|
||||||
If you now set the PITR horizon on the project to 0, so that no
|
|
||||||
historical data is retained, then the beginning PITR horizon would be
|
|
||||||
at the end of the branch, so the size of the snapshot would be
|
|
||||||
calculated at the end of the branch, after the insertions. Then the
|
|
||||||
synthetic size is:
|
|
||||||
|
|
||||||
> 15 GB (snapshot) + 0 GB (WAL) = 15 GB.
|
|
||||||
|
|
||||||
In this case, the synthetic size is the same, regardless of the PITR horizon,
|
|
||||||
because all the history consists of inserts. The newly inserted data takes
|
|
||||||
up the same amount of space, whether it's stored as part of the logical
|
|
||||||
snapshot, or as WAL. (*)
|
|
||||||
|
|
||||||
(*) This is a rough approximation. In reality, the WAL contains
|
|
||||||
headers and other overhead, and on the other hand, the logical
|
|
||||||
snapshot includes empty space on pages, so the size of insertions in
|
|
||||||
WAL can be smaller or greater than the size of the final table after
|
|
||||||
the insertions. But in most cases, it's in the same ballpark.
|
|
||||||
|
|
||||||
## Example: Data is DELETEd
|
|
||||||
|
|
||||||
Let's look at another example:
|
|
||||||
|
|
||||||
Let's start again with a database that contains 10 GB of data. Then,
|
|
||||||
you DELETE 5 GB of the data, and run VACUUM to free up the space, so
|
|
||||||
that the logical size of the database is now only 5 GB.
|
|
||||||
|
|
||||||
Let's assume that the WAL for the deletions and the vacuum take up
|
|
||||||
100 MB of space. In that case, the synthetic size of the project is:
|
|
||||||
|
|
||||||
> 10 GB (snapshot) + 100 MB (WAL) = 10.1 GB
|
|
||||||
|
|
||||||
This is much larger than the logical size of the database after the
|
|
||||||
deletions (5 GB). That's because the system still needs to retain the
|
|
||||||
deleted data, because it's still accessible to queries and branching
|
|
||||||
in the PITR window.
|
|
||||||
|
|
||||||
If you now set the PITR horizon to 0 or just wait for time to pass so
|
|
||||||
that the data falls out of the PITR horizon, making the deleted data
|
|
||||||
inaccessible, the synthetic size shrinks:
|
|
||||||
|
|
||||||
> 5 GB (snapshot) + 0 GB (WAL) = 5 GB
|
|
||||||
|
|
||||||
|
|
||||||
# Branching
|
|
||||||
|
|
||||||
Things get more complicated with branching. Branches in Neon are
|
|
||||||
copy-on-write, which is also reflected in the synthetic size.
|
|
||||||
|
|
||||||
When you create a branch, it doesn't immediately change the synthetic
|
|
||||||
size at all. The branch point is within the PITR horizon, and all the
|
|
||||||
data needed to recover to that point in time needs to be retained
|
|
||||||
anyway.
|
|
||||||
|
|
||||||
However, if you make modifications on the branch, the system needs to
|
|
||||||
keep the WAL of those modifications. The WAL is included in the
|
|
||||||
synthetic size.
|
|
||||||
|
|
||||||
## Example: branch and INSERT
|
|
||||||
|
|
||||||
Let's assume that you again start with a 10 GB database.
|
|
||||||
On the main branch, you insert 2 GB of data. Then you create
|
|
||||||
a branch at that point, and insert another 3 GB of data on the
|
|
||||||
main branch, and 1 GB of data on the child branch
|
|
||||||
|
|
||||||
```
|
|
||||||
child +#####>
|
|
||||||
|
|
|
||||||
| WAL
|
|
||||||
main ---------###############>
|
|
||||||
^
|
|
||||||
snapshot
|
|
||||||
```
|
|
||||||
|
|
||||||
In this case, the synthetic size consists of:
|
|
||||||
- the snapshot at the beginning of the PITR horizon (10 GB)
|
|
||||||
- the WAL on the main branch (2 GB + 3 GB = 5 GB)
|
|
||||||
- the WAL on the child branch (1 GB)
|
|
||||||
|
|
||||||
Total: 16 GB
|
|
||||||
|
|
||||||
# Diverging branches
|
|
||||||
|
|
||||||
If there is only a small amount of changes in the database on the
|
|
||||||
different branches, as in the previous example, the synthetic size
|
|
||||||
consists of a snapshot before the branch point, containing all the
|
|
||||||
shared data, and the WAL on both branches. However, if the branches
|
|
||||||
diverge a lot, it is more efficient to store a separate snapshot of
|
|
||||||
branches.
|
|
||||||
|
|
||||||
## Example: diverging branches
|
|
||||||
|
|
||||||
You start with a 10 GB database. You insert 5 GB of data on the main
|
|
||||||
branch. Then you create a branch, and immediately delete all the data
|
|
||||||
on the child branch and insert 5 GB of new data to it. Then you do the
|
|
||||||
same on the main branch. Let's assume
|
|
||||||
that the PITR horizon requires keeping the last 1 GB of WAL on the
|
|
||||||
both branches.
|
|
||||||
|
|
||||||
```
|
|
||||||
snapshot
|
|
||||||
v WAL
|
|
||||||
child +---------##############>
|
|
||||||
|
|
|
||||||
|
|
|
||||||
main -------------+---------##############>
|
|
||||||
^ WAL
|
|
||||||
snapshot
|
|
||||||
```
|
|
||||||
|
|
||||||
In this case, the synthetic size consists of:
|
|
||||||
- snapshot at the beginning of the PITR horizon on the main branch (4 GB)
|
|
||||||
- WAL on the main branch (1 GB)
|
|
||||||
- snapshot at the beginning of the PITR horizon on the child branch (4 GB)
|
|
||||||
- last 1 GB of WAL on the child branch (1 GB)
|
|
||||||
|
|
||||||
Total: 10 GB
|
|
||||||
|
|
||||||
The alternative way to store this would be to take only one snapshot
|
|
||||||
at the beginning of branch point, and keep all the WAL on both
|
|
||||||
branches. However, the size with that method would be larger, as it
|
|
||||||
would require one 10 GB snapshot, and 5 GB + 5 GB of WAL. It depends
|
|
||||||
on the amount of changes (WAL) on both branches, and the logical size
|
|
||||||
at the branch point, which method would result in a smaller synthetic
|
|
||||||
size. On each branch point, the system performs the calculation with
|
|
||||||
both methods, and uses the method that is cheaper, i.e. the one that
|
|
||||||
results in a smaller synthetic size.
|
|
||||||
|
|
||||||
One way to think about this is that when you create a branch, it
|
|
||||||
starts out as a thin branch that only stores the WAL since the branch
|
|
||||||
point. As you modify it, and the amount of WAL grows, at some point
|
|
||||||
it becomes cheaper to store a completely new snapshot of the branch
|
|
||||||
and truncate the WAL.
|
|
||||||
|
|
||||||
|
|
||||||
# What is the size of an individual branch?
|
|
||||||
|
|
||||||
Synthetic size is calculated for the whole project, and includes all
|
|
||||||
branches. There is no such thing as the size of a branch, because it
|
|
||||||
is not straighforward to attribute the parts of size to individual
|
|
||||||
branches.
|
|
||||||
|
|
||||||
## Example: attributing size to branches
|
|
||||||
|
|
||||||
(copied from https://github.com/neondatabase/neon/pull/2884#discussion_r1029365278)
|
|
||||||
|
|
||||||
Imagine that you create two branches, A and B, at the same point from
|
|
||||||
main branch, and do a couple of small updates on both branches. Then
|
|
||||||
six months pass, and during those six months the data on the main
|
|
||||||
branch churns over completely multiple times. The retention period is,
|
|
||||||
say 1 month.
|
|
||||||
|
|
||||||
```
|
|
||||||
+------> A
|
|
||||||
/
|
|
||||||
--------------------*-------------------------------> main
|
|
||||||
\
|
|
||||||
+--------> B
|
|
||||||
```
|
|
||||||
|
|
||||||
In that situation, the synthetic tenant size would be calculated based
|
|
||||||
on a "logical snapshot" at the branch point, that is, the logical size
|
|
||||||
of the database at that point. Plus the WAL on branches A and B. Let's
|
|
||||||
say that the snapshot size is 10 GB, and the WAL is 1 MB on both
|
|
||||||
branches A and B. So the total synthetic storage size is 10002
|
|
||||||
MB. (Let's ignore the main branch for now, that would be just added to
|
|
||||||
the sum)
|
|
||||||
|
|
||||||
How would you break that down per branch? I can think of three
|
|
||||||
different ways to do it, and all of them have their own problems:
|
|
||||||
|
|
||||||
### Subtraction method
|
|
||||||
|
|
||||||
For each branch, calculate how much smaller the total synthetic size
|
|
||||||
would be, if that branch didn't exist. In other words, how much would
|
|
||||||
you save if you dropped the branch. With this method, the size of
|
|
||||||
branches A and B is 1 MB.
|
|
||||||
|
|
||||||
With this method, the 10 GB shared logical snapshot is not included
|
|
||||||
for A nor B. So the size of all branches is not equal to the total
|
|
||||||
synthetic size of the tenant. If you drop branch A, you save 1 MB as
|
|
||||||
you'd expect, but also the size of B suddenly jumps from 1 MB to 10001
|
|
||||||
MB, which might feel surprising.
|
|
||||||
|
|
||||||
### Division method
|
|
||||||
|
|
||||||
Divide the common parts evenly across all branches that need
|
|
||||||
them. With this method, the size of branches A and B would be 5001 MB.
|
|
||||||
|
|
||||||
With this method, the sum of all branches adds up to the total
|
|
||||||
synthetic size. But it's surprising in other ways: if you drop branch
|
|
||||||
A, you might think that you save 5001 MB, but in reality you only save
|
|
||||||
1 MB, and the size of branch B suddenly grows from 5001 to 10001 MB.
|
|
||||||
|
|
||||||
### Addition method
|
|
||||||
|
|
||||||
For each branch, include all the snapshots and WAL that it depends on,
|
|
||||||
even if some of them are shared by other branches. With this method,
|
|
||||||
the size of branches A and B would be 10001 MB.
|
|
||||||
|
|
||||||
The surprise with this method is that the sum of all the branches is
|
|
||||||
larger than the total synthetic size. And if you drop branch A, the
|
|
||||||
total synthetic size doesn't fall by 10001 MB as you might think.
|
|
||||||
|
|
||||||
# Alternatives
|
|
||||||
|
|
||||||
A sort of cop-out method would be to show the whole tree of branches
|
|
||||||
graphically, and for each section of WAL or logical snapshot, display
|
|
||||||
the size of that section. You can then see which branches depend on
|
|
||||||
which sections, which sections are shared etc. That would be good to
|
|
||||||
have in the UI anyway.
|
|
||||||
|
|
||||||
Or perhaps calculate per-branch numbers using the subtraction method,
|
|
||||||
and in addition to that, one more number for "shared size" that
|
|
||||||
includes all the data that is needed by more than one branch.
|
|
||||||
|
|
||||||
## Which is the right method?
|
|
||||||
|
|
||||||
The bottom line is that it's not straightforward to attribute the
|
|
||||||
synthetic size to individual branches. There are things we can do, and
|
|
||||||
all of those methods are pretty straightforward to implement, but they
|
|
||||||
all have their own problems. What makes sense depends a lot on what
|
|
||||||
you want to do with the number, what question you are trying to
|
|
||||||
answer.
|
|
||||||
@@ -14,6 +14,5 @@ byteorder.workspace = true
|
|||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
postgres_ffi.workspace = true
|
postgres_ffi.workspace = true
|
||||||
enum-map.workspace = true
|
enum-map.workspace = true
|
||||||
serde_json.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -155,11 +155,6 @@ pub struct TenantConfigRequest {
|
|||||||
pub lagging_wal_timeout: Option<String>,
|
pub lagging_wal_timeout: Option<String>,
|
||||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||||
pub trace_read_requests: Option<bool>,
|
pub trace_read_requests: Option<bool>,
|
||||||
// We defer the parsing of the eviction_policy field to the request handler.
|
|
||||||
// Otherwise we'd have to move the types for eviction policy into this package.
|
|
||||||
// We might do that once the eviction feature has stabilizied.
|
|
||||||
// For now, this field is not even documented in the openapi_spec.yml.
|
|
||||||
pub eviction_policy: Option<serde_json::Value>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TenantConfigRequest {
|
impl TenantConfigRequest {
|
||||||
@@ -179,7 +174,6 @@ impl TenantConfigRequest {
|
|||||||
lagging_wal_timeout: None,
|
lagging_wal_timeout: None,
|
||||||
max_lsn_wal_lag: None,
|
max_lsn_wal_lag: None,
|
||||||
trace_read_requests: None,
|
trace_read_requests: None,
|
||||||
eviction_policy: None,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -269,11 +263,11 @@ pub struct LayerResidenceEvent {
|
|||||||
///
|
///
|
||||||
#[serde(rename = "timestamp_millis_since_epoch")]
|
#[serde(rename = "timestamp_millis_since_epoch")]
|
||||||
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
|
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
|
||||||
pub timestamp: SystemTime,
|
timestamp: SystemTime,
|
||||||
/// The new residence status of the layer.
|
/// The new residence status of the layer.
|
||||||
pub status: LayerResidenceStatus,
|
status: LayerResidenceStatus,
|
||||||
/// The reason why we had to record this event.
|
/// The reason why we had to record this event.
|
||||||
pub reason: LayerResidenceEventReason,
|
reason: LayerResidenceEventReason,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The reason for recording a given [`ResidenceEvent`].
|
/// The reason for recording a given [`ResidenceEvent`].
|
||||||
|
|||||||
@@ -98,15 +98,6 @@ impl RelTag {
|
|||||||
|
|
||||||
name
|
name
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn with_forknum(&self, forknum: u8) -> Self {
|
|
||||||
RelTag {
|
|
||||||
forknum,
|
|
||||||
spcnode: self.spcnode,
|
|
||||||
dbnode: self.dbnode,
|
|
||||||
relnode: self.relnode,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -75,36 +75,27 @@ impl StartupMessageParams {
|
|||||||
/// taking into account all escape sequences but leaving them as-is.
|
/// taking into account all escape sequences but leaving them as-is.
|
||||||
/// [`None`] means that there's no `options` in [`Self`].
|
/// [`None`] means that there's no `options` in [`Self`].
|
||||||
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
||||||
self.get("options").map(Self::parse_options_raw)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Split command-line options according to PostgreSQL's logic,
|
|
||||||
/// applying all escape sequences (using owned strings as needed).
|
|
||||||
/// [`None`] means that there's no `options` in [`Self`].
|
|
||||||
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
|
||||||
self.get("options").map(Self::parse_options_escaped)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Split command-line options according to PostgreSQL's logic,
|
|
||||||
/// taking into account all escape sequences but leaving them as-is.
|
|
||||||
pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
|
|
||||||
// See `postgres: pg_split_opts`.
|
// See `postgres: pg_split_opts`.
|
||||||
let mut last_was_escape = false;
|
let mut last_was_escape = false;
|
||||||
input
|
let iter = self
|
||||||
|
.get("options")?
|
||||||
.split(move |c: char| {
|
.split(move |c: char| {
|
||||||
// We split by non-escaped whitespace symbols.
|
// We split by non-escaped whitespace symbols.
|
||||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||||
last_was_escape = c == '\\' && !last_was_escape;
|
last_was_escape = c == '\\' && !last_was_escape;
|
||||||
should_split
|
should_split
|
||||||
})
|
})
|
||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty());
|
||||||
|
|
||||||
|
Some(iter)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Split command-line options according to PostgreSQL's logic,
|
/// Split command-line options according to PostgreSQL's logic,
|
||||||
/// applying all escape sequences (using owned strings as needed).
|
/// applying all escape sequences (using owned strings as needed).
|
||||||
pub fn parse_options_escaped(input: &str) -> impl Iterator<Item = Cow<'_, str>> {
|
/// [`None`] means that there's no `options` in [`Self`].
|
||||||
|
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
||||||
// See `postgres: pg_split_opts`.
|
// See `postgres: pg_split_opts`.
|
||||||
Self::parse_options_raw(input).map(|s| {
|
let iter = self.options_raw()?.map(|s| {
|
||||||
let mut preserve_next_escape = false;
|
let mut preserve_next_escape = false;
|
||||||
let escape = |c| {
|
let escape = |c| {
|
||||||
// We should remove '\\' unless it's preceded by '\\'.
|
// We should remove '\\' unless it's preceded by '\\'.
|
||||||
@@ -117,12 +108,9 @@ impl StartupMessageParams {
|
|||||||
true => Cow::Owned(s.replace(escape, "")),
|
true => Cow::Owned(s.replace(escape, "")),
|
||||||
false => Cow::Borrowed(s),
|
false => Cow::Borrowed(s),
|
||||||
}
|
}
|
||||||
})
|
});
|
||||||
}
|
|
||||||
|
|
||||||
/// Iterate through key-value pairs in an arbitrary order.
|
Some(iter)
|
||||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
|
||||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function is mostly useful in tests.
|
// This function is mostly useful in tests.
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ toml_edit.workspace = true
|
|||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
metrics.workspace = true
|
metrics.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
pin-project-lite.workspace = true
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
|||||||
@@ -20,10 +20,7 @@ use aws_sdk_s3::{
|
|||||||
};
|
};
|
||||||
use aws_smithy_http::body::SdkBody;
|
use aws_smithy_http::body::SdkBody;
|
||||||
use hyper::Body;
|
use hyper::Body;
|
||||||
use tokio::{
|
use tokio::{io, sync::Semaphore};
|
||||||
io::{self, AsyncRead},
|
|
||||||
sync::Semaphore,
|
|
||||||
};
|
|
||||||
use tokio_util::io::ReaderStream;
|
use tokio_util::io::ReaderStream;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
@@ -105,7 +102,7 @@ pub struct S3Bucket {
|
|||||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||||
// The helps to ensure we don't exceed the thresholds.
|
// The helps to ensure we don't exceed the thresholds.
|
||||||
concurrency_limiter: Arc<Semaphore>,
|
concurrency_limiter: Semaphore,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@@ -165,7 +162,7 @@ impl S3Bucket {
|
|||||||
client,
|
client,
|
||||||
bucket_name: aws_config.bucket_name.clone(),
|
bucket_name: aws_config.bucket_name.clone(),
|
||||||
prefix_in_bucket,
|
prefix_in_bucket,
|
||||||
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
|
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,10 +194,9 @@ impl S3Bucket {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||||
let permit = self
|
let _guard = self
|
||||||
.concurrency_limiter
|
.concurrency_limiter
|
||||||
.clone()
|
.acquire()
|
||||||
.acquire_owned()
|
|
||||||
.await
|
.await
|
||||||
.context("Concurrency limiter semaphore got closed during S3 download")
|
.context("Concurrency limiter semaphore got closed during S3 download")
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
@@ -221,10 +217,9 @@ impl S3Bucket {
|
|||||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||||
Ok(Download {
|
Ok(Download {
|
||||||
metadata,
|
metadata,
|
||||||
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
|
download_stream: Box::pin(io::BufReader::new(
|
||||||
permit,
|
|
||||||
object_output.body.into_async_read(),
|
object_output.body.into_async_read(),
|
||||||
))),
|
)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
Err(SdkError::ServiceError {
|
Err(SdkError::ServiceError {
|
||||||
@@ -245,32 +240,6 @@ impl S3Bucket {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pin_project_lite::pin_project! {
|
|
||||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
|
||||||
struct RatelimitedAsyncRead<S> {
|
|
||||||
permit: tokio::sync::OwnedSemaphorePermit,
|
|
||||||
#[pin]
|
|
||||||
inner: S,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
|
|
||||||
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
|
|
||||||
RatelimitedAsyncRead { permit, inner }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
|
||||||
fn poll_read(
|
|
||||||
self: std::pin::Pin<&mut Self>,
|
|
||||||
cx: &mut std::task::Context<'_>,
|
|
||||||
buf: &mut io::ReadBuf<'_>,
|
|
||||||
) -> std::task::Poll<std::io::Result<()>> {
|
|
||||||
let this = self.project();
|
|
||||||
this.inner.poll_read(cx, buf)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl RemoteStorage for S3Bucket {
|
impl RemoteStorage for S3Bucket {
|
||||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||||
|
|||||||
@@ -7,7 +7,5 @@ license.workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
serde.workspace = true
|
|
||||||
serde_json.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -1,219 +0,0 @@
|
|||||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
|
||||||
|
|
||||||
//
|
|
||||||
// *-g--*---D--->
|
|
||||||
// /
|
|
||||||
// /
|
|
||||||
// / *---b----*-B--->
|
|
||||||
// / /
|
|
||||||
// / /
|
|
||||||
// -----*--e---*-----f----* C
|
|
||||||
// E \
|
|
||||||
// \
|
|
||||||
// *--a---*---A-->
|
|
||||||
//
|
|
||||||
// If A and B need to be retained, is it cheaper to store
|
|
||||||
// snapshot at C+a+b, or snapshots at A and B ?
|
|
||||||
//
|
|
||||||
// If D also needs to be retained, which is cheaper:
|
|
||||||
//
|
|
||||||
// 1. E+g+e+f+a+b
|
|
||||||
// 2. D+C+a+b
|
|
||||||
// 3. D+A+B
|
|
||||||
|
|
||||||
/// [`Segment`] which has had it's size calculated.
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
struct SegmentSize {
|
|
||||||
method: SegmentMethod,
|
|
||||||
|
|
||||||
// calculated size of this subtree, using this method
|
|
||||||
accum_size: u64,
|
|
||||||
|
|
||||||
seg_id: usize,
|
|
||||||
children: Vec<SegmentSize>,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct SizeAlternatives {
|
|
||||||
// cheapest alternative if parent is available.
|
|
||||||
incremental: SegmentSize,
|
|
||||||
|
|
||||||
// cheapest alternative if parent node is not available
|
|
||||||
non_incremental: Option<SegmentSize>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl StorageModel {
|
|
||||||
pub fn calculate(&self) -> SizeResult {
|
|
||||||
// Build adjacency list. 'child_list' is indexed by segment id. Each entry
|
|
||||||
// contains a list of all child segments of the segment.
|
|
||||||
let mut roots: Vec<usize> = Vec::new();
|
|
||||||
let mut child_list: Vec<Vec<usize>> = Vec::new();
|
|
||||||
child_list.resize(self.segments.len(), Vec::new());
|
|
||||||
|
|
||||||
for (seg_id, seg) in self.segments.iter().enumerate() {
|
|
||||||
if let Some(parent_id) = seg.parent {
|
|
||||||
child_list[parent_id].push(seg_id);
|
|
||||||
} else {
|
|
||||||
roots.push(seg_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut segment_results = Vec::new();
|
|
||||||
segment_results.resize(
|
|
||||||
self.segments.len(),
|
|
||||||
SegmentSizeResult {
|
|
||||||
method: SegmentMethod::Skipped,
|
|
||||||
accum_size: 0,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut total_size = 0;
|
|
||||||
for root in roots {
|
|
||||||
if let Some(selected) = self.size_here(root, &child_list).non_incremental {
|
|
||||||
StorageModel::fill_selected_sizes(&selected, &mut segment_results);
|
|
||||||
total_size += selected.accum_size;
|
|
||||||
} else {
|
|
||||||
// Couldn't find any way to get this root. Error?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
SizeResult {
|
|
||||||
total_size,
|
|
||||||
segments: segment_results,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec<SegmentSizeResult>) {
|
|
||||||
result[selected.seg_id] = SegmentSizeResult {
|
|
||||||
method: selected.method,
|
|
||||||
accum_size: selected.accum_size,
|
|
||||||
};
|
|
||||||
// recurse to children
|
|
||||||
for child in selected.children.iter() {
|
|
||||||
StorageModel::fill_selected_sizes(child, result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// This is the core of the sizing calculation.
|
|
||||||
//
|
|
||||||
// This is a recursive function, that for each Segment calculates the best way
|
|
||||||
// to reach all the Segments that are marked as needed in this subtree, under two
|
|
||||||
// different conditions:
|
|
||||||
// a) when the parent of this segment is available (as a snaphot or through WAL), and
|
|
||||||
// b) when the parent of this segment is not available.
|
|
||||||
//
|
|
||||||
fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives {
|
|
||||||
let seg = &self.segments[seg_id];
|
|
||||||
// First figure out the best way to get each child
|
|
||||||
let mut children = Vec::new();
|
|
||||||
for child_id in &child_list[seg_id] {
|
|
||||||
children.push(self.size_here(*child_id, child_list))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Method 1. If this node is not needed, we can skip it as long as we
|
|
||||||
// take snapshots later in each sub-tree
|
|
||||||
let snapshot_later = if !seg.needed {
|
|
||||||
let mut snapshot_later = SegmentSize {
|
|
||||||
seg_id,
|
|
||||||
method: SegmentMethod::Skipped,
|
|
||||||
accum_size: 0,
|
|
||||||
children: Vec::new(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut possible = true;
|
|
||||||
for child in children.iter() {
|
|
||||||
if let Some(non_incremental) = &child.non_incremental {
|
|
||||||
snapshot_later.accum_size += non_incremental.accum_size;
|
|
||||||
snapshot_later.children.push(non_incremental.clone())
|
|
||||||
} else {
|
|
||||||
possible = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if possible {
|
|
||||||
Some(snapshot_later)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
// Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of
|
|
||||||
// this Segment was given.
|
|
||||||
let snapshot_here = if !seg.needed || seg.parent.is_none() {
|
|
||||||
if let Some(snapshot_size) = seg.size {
|
|
||||||
let mut snapshot_here = SegmentSize {
|
|
||||||
seg_id,
|
|
||||||
method: SegmentMethod::SnapshotHere,
|
|
||||||
accum_size: snapshot_size,
|
|
||||||
children: Vec::new(),
|
|
||||||
};
|
|
||||||
for child in children.iter() {
|
|
||||||
snapshot_here.accum_size += child.incremental.accum_size;
|
|
||||||
snapshot_here.children.push(child.incremental.clone())
|
|
||||||
}
|
|
||||||
Some(snapshot_here)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
// Method 3. Use WAL to get here from parent
|
|
||||||
let wal_here = {
|
|
||||||
let mut wal_here = SegmentSize {
|
|
||||||
seg_id,
|
|
||||||
method: SegmentMethod::Wal,
|
|
||||||
accum_size: if let Some(parent_id) = seg.parent {
|
|
||||||
seg.lsn - self.segments[parent_id].lsn
|
|
||||||
} else {
|
|
||||||
0
|
|
||||||
},
|
|
||||||
children: Vec::new(),
|
|
||||||
};
|
|
||||||
for child in children {
|
|
||||||
wal_here.accum_size += child.incremental.accum_size;
|
|
||||||
wal_here.children.push(child.incremental)
|
|
||||||
}
|
|
||||||
wal_here
|
|
||||||
};
|
|
||||||
|
|
||||||
// If the parent is not available, what's the cheapest method involving
|
|
||||||
// a snapshot here or later?
|
|
||||||
let mut cheapest_non_incremental: Option<SegmentSize> = None;
|
|
||||||
if let Some(snapshot_here) = snapshot_here {
|
|
||||||
cheapest_non_incremental = Some(snapshot_here);
|
|
||||||
}
|
|
||||||
if let Some(snapshot_later) = snapshot_later {
|
|
||||||
// Use <=, to prefer skipping if the size is equal
|
|
||||||
if let Some(parent) = &cheapest_non_incremental {
|
|
||||||
if snapshot_later.accum_size <= parent.accum_size {
|
|
||||||
cheapest_non_incremental = Some(snapshot_later);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
cheapest_non_incremental = Some(snapshot_later);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// And what's the cheapest method, if the parent is available?
|
|
||||||
let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental
|
|
||||||
{
|
|
||||||
// Is it cheaper to use a snapshot here or later, anyway?
|
|
||||||
// Use <, to prefer Wal over snapshot if the cost is the same
|
|
||||||
if wal_here.accum_size < cheapest_non_incremental.accum_size {
|
|
||||||
wal_here
|
|
||||||
} else {
|
|
||||||
cheapest_non_incremental.clone()
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
wal_here
|
|
||||||
};
|
|
||||||
|
|
||||||
SizeAlternatives {
|
|
||||||
incremental: cheapest_incremental,
|
|
||||||
non_incremental: cheapest_non_incremental,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,70 +1,401 @@
|
|||||||
//! Synthetic size calculation
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
mod calculation;
|
use anyhow::Context;
|
||||||
pub mod svg;
|
|
||||||
|
|
||||||
/// StorageModel is the input to the synthetic size calculation. It represents
|
/// Pricing model or history size builder.
|
||||||
/// a tree of timelines, with just the information that's needed for the
|
|
||||||
/// calculation. This doesn't track timeline names or where each timeline
|
|
||||||
/// begins and ends, for example. Instead, it consists of "points of interest"
|
|
||||||
/// on the timelines. A point of interest could be the timeline start or end point,
|
|
||||||
/// the oldest point on a timeline that needs to be retained because of PITR
|
|
||||||
/// cutoff, or snapshot points named by the user. For each such point, and the
|
|
||||||
/// edge connecting the points (implicit in Segment), we store information about
|
|
||||||
/// whether we need to be able to recover to the point, and if known, the logical
|
|
||||||
/// size at the point.
|
|
||||||
///
|
///
|
||||||
/// The segments must form a well-formed tree, with no loops.
|
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||||
#[derive(serde::Serialize)]
|
/// type.
|
||||||
pub struct StorageModel {
|
pub struct Storage<K: 'static> {
|
||||||
pub segments: Vec<Segment>,
|
segments: Vec<Segment>,
|
||||||
|
|
||||||
|
/// Mapping from the branch name to the index of a segment describing it's latest state.
|
||||||
|
branches: HashMap<K, usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Segment represents one point in the tree of branches, *and* the edge that leads
|
/// Snapshot of a branch.
|
||||||
/// to it (if any). We don't need separate structs for points and edges, because each
|
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||||
/// point can have only one parent.
|
|
||||||
///
|
|
||||||
/// When 'needed' is true, it means that we need to be able to reconstruct
|
|
||||||
/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only
|
|
||||||
/// a single point is needed, create two Segments with the same lsn, and mark only
|
|
||||||
/// the child as needed.
|
|
||||||
///
|
|
||||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
|
||||||
pub struct Segment {
|
pub struct Segment {
|
||||||
/// Previous segment index into ['Storage::segments`], if any.
|
/// Previous segment index into ['Storage::segments`], if any.
|
||||||
pub parent: Option<usize>,
|
parent: Option<usize>,
|
||||||
|
|
||||||
/// LSN at this point
|
/// Description of how did we get to this state.
|
||||||
pub lsn: u64,
|
///
|
||||||
|
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
|
||||||
|
/// modifying a branch directly.
|
||||||
|
pub op: Cow<'static, str>,
|
||||||
|
|
||||||
/// Logical size at this node, if known.
|
/// LSN before this state
|
||||||
pub size: Option<u64>,
|
start_lsn: u64,
|
||||||
|
|
||||||
/// If true, the segment from parent to this node is needed by `retention_period`
|
/// LSN at this state
|
||||||
|
pub end_lsn: u64,
|
||||||
|
|
||||||
|
/// Logical size before this state
|
||||||
|
start_size: u64,
|
||||||
|
|
||||||
|
/// Logical size at this state. Can be None in the last Segment of a branch.
|
||||||
|
pub end_size: Option<u64>,
|
||||||
|
|
||||||
|
/// Indices to [`Storage::segments`]
|
||||||
|
///
|
||||||
|
/// FIXME: this could be an Option<usize>
|
||||||
|
children_after: Vec<usize>,
|
||||||
|
|
||||||
|
/// Determined by `retention_period` given to [`Storage::calculate`]
|
||||||
pub needed: bool,
|
pub needed: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Result of synthetic size calculation. Returned by StorageModel::calculate()
|
//
|
||||||
pub struct SizeResult {
|
//
|
||||||
pub total_size: u64,
|
//
|
||||||
|
//
|
||||||
|
// *-g--*---D--->
|
||||||
|
// /
|
||||||
|
// /
|
||||||
|
// / *---b----*-B--->
|
||||||
|
// / /
|
||||||
|
// / /
|
||||||
|
// -----*--e---*-----f----* C
|
||||||
|
// E \
|
||||||
|
// \
|
||||||
|
// *--a---*---A-->
|
||||||
|
//
|
||||||
|
// If A and B need to be retained, is it cheaper to store
|
||||||
|
// snapshot at C+a+b, or snapshots at A and B ?
|
||||||
|
//
|
||||||
|
// If D also needs to be retained, which is cheaper:
|
||||||
|
//
|
||||||
|
// 1. E+g+e+f+a+b
|
||||||
|
// 2. D+C+a+b
|
||||||
|
// 3. D+A+B
|
||||||
|
|
||||||
// This has same length as the StorageModel::segments vector in the input.
|
/// [`Segment`] which has had it's size calculated.
|
||||||
// Each entry in this array corresponds to the entry with same index in
|
pub struct SegmentSize {
|
||||||
// StorageModel::segments.
|
pub seg_id: usize,
|
||||||
pub segments: Vec<SegmentSizeResult>,
|
|
||||||
|
pub method: SegmentMethod,
|
||||||
|
|
||||||
|
this_size: u64,
|
||||||
|
|
||||||
|
pub children: Vec<SegmentSize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
impl SegmentSize {
|
||||||
pub struct SegmentSizeResult {
|
fn total(&self) -> u64 {
|
||||||
pub method: SegmentMethod,
|
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||||
// calculated size of this subtree, using this method
|
}
|
||||||
pub accum_size: u64,
|
|
||||||
|
pub fn total_children(&self) -> u64 {
|
||||||
|
if self.method == SnapshotAfter {
|
||||||
|
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||||
|
} else {
|
||||||
|
self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Different methods to retain history from a particular state
|
/// Different methods to retain history from a particular state
|
||||||
#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||||
pub enum SegmentMethod {
|
pub enum SegmentMethod {
|
||||||
SnapshotHere, // A logical snapshot is needed after this segment
|
SnapshotAfter,
|
||||||
Wal, // Keep WAL leading up to this node
|
Wal,
|
||||||
|
WalNeeded,
|
||||||
Skipped,
|
Skipped,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
use SegmentMethod::*;
|
||||||
|
|
||||||
|
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||||
|
/// Creates a new storage with the given default branch name.
|
||||||
|
pub fn new(initial_branch: K) -> Storage<K> {
|
||||||
|
let init_segment = Segment {
|
||||||
|
op: "".into(),
|
||||||
|
needed: false,
|
||||||
|
parent: None,
|
||||||
|
start_lsn: 0,
|
||||||
|
end_lsn: 0,
|
||||||
|
start_size: 0,
|
||||||
|
end_size: Some(0),
|
||||||
|
children_after: Vec::new(),
|
||||||
|
};
|
||||||
|
|
||||||
|
Storage {
|
||||||
|
segments: vec![init_segment],
|
||||||
|
branches: HashMap::from([(initial_branch, 0)]),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Advances the branch with a new point, at given LSN.
|
||||||
|
pub fn insert_point<Q: ?Sized>(
|
||||||
|
&mut self,
|
||||||
|
branch: &Q,
|
||||||
|
op: Cow<'static, str>,
|
||||||
|
lsn: u64,
|
||||||
|
size: Option<u64>,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
K: std::borrow::Borrow<Q>,
|
||||||
|
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||||
|
{
|
||||||
|
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||||
|
let newseg_id = self.segments.len();
|
||||||
|
let lastseg = &mut self.segments[lastseg_id];
|
||||||
|
|
||||||
|
assert!(lsn > lastseg.end_lsn);
|
||||||
|
|
||||||
|
let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||||
|
|
||||||
|
let newseg = Segment {
|
||||||
|
op,
|
||||||
|
parent: Some(lastseg_id),
|
||||||
|
start_lsn: lastseg.end_lsn,
|
||||||
|
end_lsn: lsn,
|
||||||
|
start_size,
|
||||||
|
end_size: size,
|
||||||
|
children_after: Vec::new(),
|
||||||
|
needed: false,
|
||||||
|
};
|
||||||
|
lastseg.children_after.push(newseg_id);
|
||||||
|
|
||||||
|
self.segments.push(newseg);
|
||||||
|
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||||
|
pub fn modify_branch<Q: ?Sized>(
|
||||||
|
&mut self,
|
||||||
|
branch: &Q,
|
||||||
|
op: Cow<'static, str>,
|
||||||
|
lsn_bytes: u64,
|
||||||
|
size_bytes: i64,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
K: std::borrow::Borrow<Q>,
|
||||||
|
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||||
|
{
|
||||||
|
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||||
|
let newseg_id = self.segments.len();
|
||||||
|
let lastseg = &mut self.segments[lastseg_id];
|
||||||
|
|
||||||
|
let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||||
|
|
||||||
|
let newseg = Segment {
|
||||||
|
op,
|
||||||
|
parent: Some(lastseg_id),
|
||||||
|
start_lsn: lastseg.end_lsn,
|
||||||
|
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||||
|
start_size: last_end_size,
|
||||||
|
end_size: Some((last_end_size as i64 + size_bytes) as u64),
|
||||||
|
children_after: Vec::new(),
|
||||||
|
needed: false,
|
||||||
|
};
|
||||||
|
lastseg.children_after.push(newseg_id);
|
||||||
|
|
||||||
|
self.segments.push(newseg);
|
||||||
|
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
K: std::borrow::Borrow<Q>,
|
||||||
|
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||||
|
{
|
||||||
|
self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
K: std::borrow::Borrow<Q>,
|
||||||
|
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||||
|
{
|
||||||
|
self.modify_branch(branch, "update".into(), bytes, 0i64)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
K: std::borrow::Borrow<Q>,
|
||||||
|
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||||
|
{
|
||||||
|
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
K: std::borrow::Borrow<Q> + std::fmt::Debug,
|
||||||
|
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||||
|
{
|
||||||
|
// Find the right segment
|
||||||
|
let branchseg_id = *self.branches.get(parent).with_context(|| {
|
||||||
|
format!(
|
||||||
|
"should had found the parent {:?} by key. in branches {:?}",
|
||||||
|
parent, self.branches
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let _branchseg = &mut self.segments[branchseg_id];
|
||||||
|
|
||||||
|
// Create branch name for it
|
||||||
|
self.branches.insert(name, branchseg_id);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
|
||||||
|
// Phase 1: Mark all the segments that need to be retained
|
||||||
|
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||||
|
let last_seg = &self.segments[last_seg_id];
|
||||||
|
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
|
||||||
|
let mut seg_id = last_seg_id;
|
||||||
|
loop {
|
||||||
|
let seg = &mut self.segments[seg_id];
|
||||||
|
if seg.end_lsn < cutoff_lsn {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
seg.needed = true;
|
||||||
|
if let Some(prev_seg_id) = seg.parent {
|
||||||
|
seg_id = prev_seg_id;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 2: For each oldest segment in a chain that needs to be retained,
|
||||||
|
// calculate if we should store snapshot or WAL
|
||||||
|
self.size_from_snapshot_later(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||||
|
let seg = &self.segments[seg_id];
|
||||||
|
|
||||||
|
let this_size = seg.end_lsn - seg.start_lsn;
|
||||||
|
|
||||||
|
let mut children = Vec::new();
|
||||||
|
|
||||||
|
// try both ways
|
||||||
|
for &child_id in seg.children_after.iter() {
|
||||||
|
// try each child both ways
|
||||||
|
let child = &self.segments[child_id];
|
||||||
|
let p1 = self.size_from_wal(child_id)?;
|
||||||
|
|
||||||
|
let p = if !child.needed {
|
||||||
|
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||||
|
if p1.total() < p2.total() {
|
||||||
|
p1
|
||||||
|
} else {
|
||||||
|
p2
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
p1
|
||||||
|
};
|
||||||
|
children.push(p);
|
||||||
|
}
|
||||||
|
Ok(SegmentSize {
|
||||||
|
seg_id,
|
||||||
|
method: if seg.needed { WalNeeded } else { Wal },
|
||||||
|
this_size,
|
||||||
|
children,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||||
|
// If this is needed, then it's time to do the snapshot and continue
|
||||||
|
// with wal method.
|
||||||
|
let seg = &self.segments[seg_id];
|
||||||
|
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
|
||||||
|
if seg.needed {
|
||||||
|
let mut children = Vec::new();
|
||||||
|
|
||||||
|
for &child_id in seg.children_after.iter() {
|
||||||
|
// try each child both ways
|
||||||
|
let child = &self.segments[child_id];
|
||||||
|
let p1 = self.size_from_wal(child_id)?;
|
||||||
|
|
||||||
|
let p = if !child.needed {
|
||||||
|
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||||
|
if p1.total() < p2.total() {
|
||||||
|
p1
|
||||||
|
} else {
|
||||||
|
p2
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
p1
|
||||||
|
};
|
||||||
|
children.push(p);
|
||||||
|
}
|
||||||
|
Ok(SegmentSize {
|
||||||
|
seg_id,
|
||||||
|
method: WalNeeded,
|
||||||
|
this_size: seg.start_size,
|
||||||
|
children,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
// If any of the direct children are "needed", need to be able to reconstruct here
|
||||||
|
let mut children_needed = false;
|
||||||
|
for &child in seg.children_after.iter() {
|
||||||
|
let seg = &self.segments[child];
|
||||||
|
if seg.needed {
|
||||||
|
children_needed = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let method1 = if !children_needed {
|
||||||
|
let mut children = Vec::new();
|
||||||
|
for child in seg.children_after.iter() {
|
||||||
|
children.push(self.size_from_snapshot_later(*child)?);
|
||||||
|
}
|
||||||
|
Some(SegmentSize {
|
||||||
|
seg_id,
|
||||||
|
method: Skipped,
|
||||||
|
this_size: 0,
|
||||||
|
children,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// If this a junction, consider snapshotting here
|
||||||
|
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||||
|
let mut children = Vec::new();
|
||||||
|
for child in seg.children_after.iter() {
|
||||||
|
children.push(self.size_from_wal(*child)?);
|
||||||
|
}
|
||||||
|
let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
|
||||||
|
Some(SegmentSize {
|
||||||
|
seg_id,
|
||||||
|
method: SnapshotAfter,
|
||||||
|
this_size,
|
||||||
|
children,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(match (method1, method2) {
|
||||||
|
(None, None) => anyhow::bail!(
|
||||||
|
"neither method was applicable: children_after={}, children_needed={}",
|
||||||
|
seg.children_after.len(),
|
||||||
|
children_needed
|
||||||
|
),
|
||||||
|
(Some(method), None) => method,
|
||||||
|
(None, Some(method)) => method,
|
||||||
|
(Some(method1), Some(method2)) => {
|
||||||
|
if method1.total() < method2.total() {
|
||||||
|
method1
|
||||||
|
} else {
|
||||||
|
method2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_segments(self) -> Vec<Segment> {
|
||||||
|
self.segments
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
269
libs/tenant_size_model/src/main.rs
Normal file
269
libs/tenant_size_model/src/main.rs
Normal file
@@ -0,0 +1,269 @@
|
|||||||
|
//! Tenant size model testing ground.
|
||||||
|
//!
|
||||||
|
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
|
||||||
|
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
|
||||||
|
//! into pngs.
|
||||||
|
|
||||||
|
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||||
|
|
||||||
|
// Main branch only. Some updates on it.
|
||||||
|
fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||||
|
// Create main branch
|
||||||
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
|
// Bulk load 5 GB of data to it
|
||||||
|
storage.insert("main", 5_000)?;
|
||||||
|
|
||||||
|
// Stream of updates
|
||||||
|
for _ in 0..5 {
|
||||||
|
storage.update("main", 1_000)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let size = storage.calculate(1000)?;
|
||||||
|
|
||||||
|
Ok((storage.into_segments(), size))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Main branch only. Some updates on it.
|
||||||
|
fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||||
|
// Create main branch
|
||||||
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
|
// Bulk load 5 GB of data to it
|
||||||
|
storage.insert("main", 5_000)?;
|
||||||
|
|
||||||
|
// Stream of updates
|
||||||
|
for _ in 0..5 {
|
||||||
|
storage.update("main", 1_000)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Branch
|
||||||
|
storage.branch("main", "child")?;
|
||||||
|
storage.update("child", 1_000)?;
|
||||||
|
|
||||||
|
// More updates on parent
|
||||||
|
storage.update("main", 1_000)?;
|
||||||
|
|
||||||
|
let size = storage.calculate(1000)?;
|
||||||
|
|
||||||
|
Ok((storage.into_segments(), size))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Like 2, but more updates on main
|
||||||
|
fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||||
|
// Create main branch
|
||||||
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
|
// Bulk load 5 GB of data to it
|
||||||
|
storage.insert("main", 5_000)?;
|
||||||
|
|
||||||
|
// Stream of updates
|
||||||
|
for _ in 0..5 {
|
||||||
|
storage.update("main", 1_000)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Branch
|
||||||
|
storage.branch("main", "child")?;
|
||||||
|
storage.update("child", 1_000)?;
|
||||||
|
|
||||||
|
// More updates on parent
|
||||||
|
for _ in 0..5 {
|
||||||
|
storage.update("main", 1_000)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let size = storage.calculate(1000)?;
|
||||||
|
|
||||||
|
Ok((storage.into_segments(), size))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Diverged branches
|
||||||
|
fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||||
|
// Create main branch
|
||||||
|
let mut storage = Storage::new("main");
|
||||||
|
|
||||||
|
// Bulk load 5 GB of data to it
|
||||||
|
storage.insert("main", 5_000)?;
|
||||||
|
|
||||||
|
// Stream of updates
|
||||||
|
for _ in 0..5 {
|
||||||
|
storage.update("main", 1_000)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Branch
|
||||||
|
storage.branch("main", "child")?;
|
||||||
|
storage.update("child", 1_000)?;
|
||||||
|
|
||||||
|
// More updates on parent
|
||||||
|
for _ in 0..8 {
|
||||||
|
storage.update("main", 1_000)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let size = storage.calculate(1000)?;
|
||||||
|
|
||||||
|
Ok((storage.into_segments(), size))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||||
|
let mut storage = Storage::new("a");
|
||||||
|
storage.insert("a", 5000)?;
|
||||||
|
storage.branch("a", "b")?;
|
||||||
|
storage.update("b", 4000)?;
|
||||||
|
storage.update("a", 2000)?;
|
||||||
|
storage.branch("a", "c")?;
|
||||||
|
storage.insert("c", 4000)?;
|
||||||
|
storage.insert("a", 2000)?;
|
||||||
|
|
||||||
|
let size = storage.calculate(5000)?;
|
||||||
|
|
||||||
|
Ok((storage.into_segments(), size))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||||
|
|
||||||
|
let branches = [
|
||||||
|
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
|
||||||
|
Some(0xb1719e044db05401a05a2ed588a3ad3f),
|
||||||
|
Some(0xb68d6691c895ad0a70809470020929ef),
|
||||||
|
];
|
||||||
|
|
||||||
|
// compared to other scenarios, this one uses bytes instead of kB
|
||||||
|
|
||||||
|
let mut storage = Storage::new(None);
|
||||||
|
|
||||||
|
storage.branch(&None, branches[0])?; // at 0
|
||||||
|
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
|
||||||
|
storage.branch(&branches[0], branches[1])?; // at 108951064
|
||||||
|
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
|
||||||
|
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
|
||||||
|
storage.branch(&branches[0], branches[2])?; // at 283415424
|
||||||
|
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
|
||||||
|
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
|
||||||
|
|
||||||
|
let size = storage.calculate(100_000)?;
|
||||||
|
|
||||||
|
Ok((storage.into_segments(), size))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let args: Vec<String> = std::env::args().collect();
|
||||||
|
|
||||||
|
let scenario = if args.len() < 2 { "1" } else { &args[1] };
|
||||||
|
|
||||||
|
let (segments, size) = match scenario {
|
||||||
|
"1" => scenario_1(),
|
||||||
|
"2" => scenario_2(),
|
||||||
|
"3" => scenario_3(),
|
||||||
|
"4" => scenario_4(),
|
||||||
|
"5" => scenario_5(),
|
||||||
|
"6" => scenario_6(),
|
||||||
|
other => {
|
||||||
|
eprintln!("invalid scenario {}", other);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
graphviz_tree(&segments, &size);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
|
||||||
|
use tenant_size_model::SegmentMethod::*;
|
||||||
|
|
||||||
|
let seg_id = node.seg_id;
|
||||||
|
let seg = segments.get(seg_id).unwrap();
|
||||||
|
let lsn = seg.end_lsn;
|
||||||
|
let size = seg.end_size.unwrap_or(0);
|
||||||
|
let method = node.method;
|
||||||
|
|
||||||
|
println!(" {{");
|
||||||
|
println!(" node [width=0.1 height=0.1 shape=oval]");
|
||||||
|
|
||||||
|
let tenant_size = node.total_children();
|
||||||
|
|
||||||
|
let penwidth = if seg.needed { 6 } else { 3 };
|
||||||
|
let x = match method {
|
||||||
|
SnapshotAfter =>
|
||||||
|
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
|
||||||
|
Wal =>
|
||||||
|
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||||
|
WalNeeded =>
|
||||||
|
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||||
|
Skipped =>
|
||||||
|
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
|
||||||
|
};
|
||||||
|
|
||||||
|
println!(" \"seg{seg_id}\" [{x}]");
|
||||||
|
println!(" }}");
|
||||||
|
|
||||||
|
// Recurse. Much of the data is actually on the edge
|
||||||
|
for child in node.children.iter() {
|
||||||
|
let child_id = child.seg_id;
|
||||||
|
graphviz_recurse(segments, child);
|
||||||
|
|
||||||
|
let edge_color = match child.method {
|
||||||
|
SnapshotAfter => "gray",
|
||||||
|
Wal => "black",
|
||||||
|
WalNeeded => "black",
|
||||||
|
Skipped => "gray",
|
||||||
|
};
|
||||||
|
|
||||||
|
println!(" {{");
|
||||||
|
println!(" edge [] ");
|
||||||
|
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
|
||||||
|
print!("color={edge_color}");
|
||||||
|
if child.method == WalNeeded {
|
||||||
|
print!(" penwidth=6");
|
||||||
|
}
|
||||||
|
if child.method == Wal {
|
||||||
|
print!(" penwidth=3");
|
||||||
|
}
|
||||||
|
|
||||||
|
let next = segments.get(child_id).unwrap();
|
||||||
|
|
||||||
|
if next.op.is_empty() {
|
||||||
|
print!(
|
||||||
|
" label=\"{} / {}\"",
|
||||||
|
next.end_lsn - seg.end_lsn,
|
||||||
|
(next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
|
||||||
|
}
|
||||||
|
println!("]");
|
||||||
|
println!(" }}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
||||||
|
println!("digraph G {{");
|
||||||
|
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
|
||||||
|
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||||
|
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||||
|
println!(" graph [center=1 rankdir=LR]");
|
||||||
|
println!(" edge [dir=none]");
|
||||||
|
|
||||||
|
graphviz_recurse(segments, tree);
|
||||||
|
|
||||||
|
println!("}}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scenarios_return_same_size() {
|
||||||
|
type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
|
||||||
|
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||||
|
(line!(), scenario_1, 8000),
|
||||||
|
(line!(), scenario_2, 9000),
|
||||||
|
(line!(), scenario_3, 13000),
|
||||||
|
(line!(), scenario_4, 16000),
|
||||||
|
(line!(), scenario_5, 17000),
|
||||||
|
(line!(), scenario_6, 333_792_000),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (line, scenario, expected) in truths {
|
||||||
|
let (_, size) = scenario().unwrap();
|
||||||
|
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,193 +0,0 @@
|
|||||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
|
||||||
use std::fmt::Write;
|
|
||||||
|
|
||||||
const SVG_WIDTH: f32 = 500.0;
|
|
||||||
|
|
||||||
struct SvgDraw<'a> {
|
|
||||||
storage: &'a StorageModel,
|
|
||||||
branches: &'a [String],
|
|
||||||
seg_to_branch: &'a [usize],
|
|
||||||
sizes: &'a [SegmentSizeResult],
|
|
||||||
|
|
||||||
// layout
|
|
||||||
xscale: f32,
|
|
||||||
min_lsn: u64,
|
|
||||||
seg_coordinates: Vec<(f32, f32)>,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn draw_legend(result: &mut String) -> anyhow::Result<()> {
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<circle cx=\"10\" cy=\"10\" r=\"5\" stroke=\"red\"/>"
|
|
||||||
)?;
|
|
||||||
writeln!(result, "<text x=\"20\" y=\"15\">logical snapshot</text>")?;
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<line x1=\"5\" y1=\"30\" x2=\"15\" y2=\"30\" stroke-width=\"6\" stroke=\"black\" />"
|
|
||||||
)?;
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<text x=\"20\" y=\"35\">WAL within retention period</text>"
|
|
||||||
)?;
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<line x1=\"5\" y1=\"50\" x2=\"15\" y2=\"50\" stroke-width=\"3\" stroke=\"black\" />"
|
|
||||||
)?;
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<text x=\"20\" y=\"55\">WAL retained to avoid copy</text>"
|
|
||||||
)?;
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
|
|
||||||
)?;
|
|
||||||
writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn draw_svg(
|
|
||||||
storage: &StorageModel,
|
|
||||||
branches: &[String],
|
|
||||||
seg_to_branch: &[usize],
|
|
||||||
sizes: &SizeResult,
|
|
||||||
) -> anyhow::Result<String> {
|
|
||||||
let mut draw = SvgDraw {
|
|
||||||
storage,
|
|
||||||
branches,
|
|
||||||
seg_to_branch,
|
|
||||||
sizes: &sizes.segments,
|
|
||||||
|
|
||||||
xscale: 0.0,
|
|
||||||
min_lsn: 0,
|
|
||||||
seg_coordinates: Vec::new(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut result = String::new();
|
|
||||||
|
|
||||||
writeln!(result, "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" height=\"300\" width=\"500\">")?;
|
|
||||||
|
|
||||||
draw.calculate_svg_layout();
|
|
||||||
|
|
||||||
// Draw the tree
|
|
||||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
|
||||||
draw.draw_seg_phase1(seg_id, &mut result)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Draw snapshots
|
|
||||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
|
||||||
draw.draw_seg_phase2(seg_id, &mut result)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
draw_legend(&mut result)?;
|
|
||||||
|
|
||||||
write!(result, "</svg>")?;
|
|
||||||
|
|
||||||
Ok(result)
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> SvgDraw<'a> {
|
|
||||||
fn calculate_svg_layout(&mut self) {
|
|
||||||
// Find x scale
|
|
||||||
let segments = &self.storage.segments;
|
|
||||||
let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min);
|
|
||||||
let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max);
|
|
||||||
|
|
||||||
// Start with 1 pixel = 1 byte. Double the scale until it fits into the image
|
|
||||||
let mut xscale = 1.0;
|
|
||||||
while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH {
|
|
||||||
xscale *= 2.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Layout the timelines on Y dimension.
|
|
||||||
// TODO
|
|
||||||
let mut y = 100.0;
|
|
||||||
let mut branch_y_coordinates = Vec::new();
|
|
||||||
for _branch in self.branches {
|
|
||||||
branch_y_coordinates.push(y);
|
|
||||||
y += 40.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate coordinates for each point
|
|
||||||
let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
|
|
||||||
.map(|(seg, branch_id)| {
|
|
||||||
let x = (seg.lsn - min_lsn) as f32 / xscale;
|
|
||||||
let y = branch_y_coordinates[*branch_id];
|
|
||||||
(x, y)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
self.xscale = xscale;
|
|
||||||
self.min_lsn = min_lsn;
|
|
||||||
self.seg_coordinates = seg_coordinates;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Draws lines between points
|
|
||||||
fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
|
||||||
let seg = &self.storage.segments[seg_id];
|
|
||||||
|
|
||||||
let wal_bytes = if let Some(parent_id) = seg.parent {
|
|
||||||
seg.lsn - self.storage.segments[parent_id].lsn
|
|
||||||
} else {
|
|
||||||
0
|
|
||||||
};
|
|
||||||
|
|
||||||
let style = match self.sizes[seg_id].method {
|
|
||||||
SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
|
|
||||||
SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
|
|
||||||
"stroke-width=\"6\" stroke=\"black\""
|
|
||||||
}
|
|
||||||
SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
|
|
||||||
SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
|
|
||||||
};
|
|
||||||
if let Some(parent_id) = seg.parent {
|
|
||||||
let (x1, y1) = self.seg_coordinates[parent_id];
|
|
||||||
let (x2, y2) = self.seg_coordinates[seg_id];
|
|
||||||
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
|
||||||
)?;
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
" <title>{wal_bytes} bytes of WAL (seg {seg_id})</title>"
|
|
||||||
)?;
|
|
||||||
writeln!(result, "</line>")?;
|
|
||||||
} else {
|
|
||||||
// draw a little dash to mark the starting point of this branch
|
|
||||||
let (x, y) = self.seg_coordinates[seg_id];
|
|
||||||
let (x1, y1) = (x, y - 5.0);
|
|
||||||
let (x2, y2) = (x, y + 5.0);
|
|
||||||
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
|
||||||
)?;
|
|
||||||
writeln!(result, " <title>(seg {seg_id})</title>")?;
|
|
||||||
writeln!(result, "</line>")?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Draw circles where snapshots are taken
|
|
||||||
fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
|
||||||
let seg = &self.storage.segments[seg_id];
|
|
||||||
|
|
||||||
// draw a snapshot point if it's needed
|
|
||||||
let (coord_x, coord_y) = self.seg_coordinates[seg_id];
|
|
||||||
if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
"<circle cx=\"{coord_x}\" cy=\"{coord_y}\" r=\"5\" stroke=\"red\">",
|
|
||||||
)?;
|
|
||||||
writeln!(
|
|
||||||
result,
|
|
||||||
" <title>logical size {}</title>",
|
|
||||||
seg.size.unwrap()
|
|
||||||
)?;
|
|
||||||
write!(result, "</circle>")?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,313 +0,0 @@
|
|||||||
//! Tenant size model tests.
|
|
||||||
|
|
||||||
use tenant_size_model::{Segment, SizeResult, StorageModel};
|
|
||||||
|
|
||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
struct ScenarioBuilder {
|
|
||||||
segments: Vec<Segment>,
|
|
||||||
|
|
||||||
/// Mapping from the branch name to the index of a segment describing its latest state.
|
|
||||||
branches: HashMap<String, usize>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ScenarioBuilder {
|
|
||||||
/// Creates a new storage with the given default branch name.
|
|
||||||
pub fn new(initial_branch: &str) -> ScenarioBuilder {
|
|
||||||
let init_segment = Segment {
|
|
||||||
parent: None,
|
|
||||||
lsn: 0,
|
|
||||||
size: Some(0),
|
|
||||||
needed: false, // determined later
|
|
||||||
};
|
|
||||||
|
|
||||||
ScenarioBuilder {
|
|
||||||
segments: vec![init_segment],
|
|
||||||
branches: HashMap::from([(initial_branch.into(), 0)]),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
|
||||||
pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) {
|
|
||||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
|
||||||
let newseg_id = self.segments.len();
|
|
||||||
let lastseg = &mut self.segments[lastseg_id];
|
|
||||||
|
|
||||||
let newseg = Segment {
|
|
||||||
parent: Some(lastseg_id),
|
|
||||||
lsn: lastseg.lsn + lsn_bytes,
|
|
||||||
size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
|
|
||||||
needed: false,
|
|
||||||
};
|
|
||||||
|
|
||||||
self.segments.push(newseg);
|
|
||||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn insert(&mut self, branch: &str, bytes: u64) {
|
|
||||||
self.modify_branch(branch, bytes, bytes as i64);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn update(&mut self, branch: &str, bytes: u64) {
|
|
||||||
self.modify_branch(branch, bytes, 0i64);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn _delete(&mut self, branch: &str, bytes: u64) {
|
|
||||||
self.modify_branch(branch, bytes, -(bytes as i64));
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Panics if the parent branch cannot be found.
|
|
||||||
pub fn branch(&mut self, parent: &str, name: &str) {
|
|
||||||
// Find the right segment
|
|
||||||
let branchseg_id = *self
|
|
||||||
.branches
|
|
||||||
.get(parent)
|
|
||||||
.expect("should had found the parent by key");
|
|
||||||
let _branchseg = &mut self.segments[branchseg_id];
|
|
||||||
|
|
||||||
// Create branch name for it
|
|
||||||
self.branches.insert(name.to_string(), branchseg_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) {
|
|
||||||
// Phase 1: Mark all the segments that need to be retained
|
|
||||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
|
||||||
let last_seg = &self.segments[last_seg_id];
|
|
||||||
let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period);
|
|
||||||
let mut seg_id = last_seg_id;
|
|
||||||
loop {
|
|
||||||
let seg = &mut self.segments[seg_id];
|
|
||||||
if seg.lsn <= cutoff_lsn {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
seg.needed = true;
|
|
||||||
if let Some(prev_seg_id) = seg.parent {
|
|
||||||
seg_id = prev_seg_id;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform the calculation
|
|
||||||
let storage_model = StorageModel {
|
|
||||||
segments: self.segments.clone(),
|
|
||||||
};
|
|
||||||
let size_result = storage_model.calculate();
|
|
||||||
(storage_model, size_result)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Main branch only. Some updates on it.
|
|
||||||
#[test]
|
|
||||||
fn scenario_1() {
|
|
||||||
// Create main branch
|
|
||||||
let mut scenario = ScenarioBuilder::new("main");
|
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
|
||||||
scenario.insert("main", 5_000);
|
|
||||||
|
|
||||||
// Stream of updates
|
|
||||||
for _ in 0..5 {
|
|
||||||
scenario.update("main", 1_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate the synthetic size with retention horizon 1000
|
|
||||||
let (_model, result) = scenario.calculate(1000);
|
|
||||||
|
|
||||||
// The end of the branch is at LSN 10000. Need to retain
|
|
||||||
// a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
|
|
||||||
// The logical snapshot has size 5000.
|
|
||||||
assert_eq!(result.total_size, 5000 + 1000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Main branch only. Some updates on it.
|
|
||||||
#[test]
|
|
||||||
fn scenario_2() {
|
|
||||||
// Create main branch
|
|
||||||
let mut scenario = ScenarioBuilder::new("main");
|
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
|
||||||
scenario.insert("main", 5_000);
|
|
||||||
|
|
||||||
// Stream of updates
|
|
||||||
for _ in 0..5 {
|
|
||||||
scenario.update("main", 1_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Branch
|
|
||||||
scenario.branch("main", "child");
|
|
||||||
scenario.update("child", 1_000);
|
|
||||||
|
|
||||||
// More updates on parent
|
|
||||||
scenario.update("main", 1_000);
|
|
||||||
|
|
||||||
//
|
|
||||||
// The history looks like this now:
|
|
||||||
//
|
|
||||||
// 10000 11000
|
|
||||||
// *----*----*--------------* main
|
|
||||||
// |
|
|
||||||
// | 11000
|
|
||||||
// +-------------- child
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// With retention horizon 1000, we need to retain logical snapshot
|
|
||||||
// at the branch point, size 5000, and the WAL from 10000-11000 on
|
|
||||||
// both branches.
|
|
||||||
let (_model, result) = scenario.calculate(1000);
|
|
||||||
|
|
||||||
assert_eq!(result.total_size, 5000 + 1000 + 1000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Like 2, but more updates on main
|
|
||||||
#[test]
|
|
||||||
fn scenario_3() {
|
|
||||||
// Create main branch
|
|
||||||
let mut scenario = ScenarioBuilder::new("main");
|
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
|
||||||
scenario.insert("main", 5_000);
|
|
||||||
|
|
||||||
// Stream of updates
|
|
||||||
for _ in 0..5 {
|
|
||||||
scenario.update("main", 1_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Branch
|
|
||||||
scenario.branch("main", "child");
|
|
||||||
scenario.update("child", 1_000);
|
|
||||||
|
|
||||||
// More updates on parent
|
|
||||||
for _ in 0..5 {
|
|
||||||
scenario.update("main", 1_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// The history looks like this now:
|
|
||||||
//
|
|
||||||
// 10000 15000
|
|
||||||
// *----*----*------------------------------------* main
|
|
||||||
// |
|
|
||||||
// | 11000
|
|
||||||
// +-------------- child
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// With retention horizon 1000, it's still cheapest to retain
|
|
||||||
// - snapshot at branch point (size 5000)
|
|
||||||
// - WAL on child between 10000-11000
|
|
||||||
// - WAL on main between 10000-15000
|
|
||||||
//
|
|
||||||
// This is in total 5000 + 1000 + 5000
|
|
||||||
//
|
|
||||||
let (_model, result) = scenario.calculate(1000);
|
|
||||||
|
|
||||||
assert_eq!(result.total_size, 5000 + 1000 + 5000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Diverged branches
|
|
||||||
#[test]
|
|
||||||
fn scenario_4() {
|
|
||||||
// Create main branch
|
|
||||||
let mut scenario = ScenarioBuilder::new("main");
|
|
||||||
|
|
||||||
// Bulk load 5 GB of data to it
|
|
||||||
scenario.insert("main", 5_000);
|
|
||||||
|
|
||||||
// Stream of updates
|
|
||||||
for _ in 0..5 {
|
|
||||||
scenario.update("main", 1_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Branch
|
|
||||||
scenario.branch("main", "child");
|
|
||||||
scenario.update("child", 1_000);
|
|
||||||
|
|
||||||
// More updates on parent
|
|
||||||
for _ in 0..8 {
|
|
||||||
scenario.update("main", 1_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// The history looks like this now:
|
|
||||||
//
|
|
||||||
// 10000 18000
|
|
||||||
// *----*----*------------------------------------* main
|
|
||||||
// |
|
|
||||||
// | 11000
|
|
||||||
// +-------------- child
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// With retention horizon 1000, it's now cheapest to retain
|
|
||||||
// separate snapshots on both branches:
|
|
||||||
// - snapshot on main branch at LSN 17000 (size 5000)
|
|
||||||
// - WAL on main between 17000-18000
|
|
||||||
// - snapshot on child branch at LSN 10000 (size 5000)
|
|
||||||
// - WAL on child between 10000-11000
|
|
||||||
//
|
|
||||||
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
|
||||||
//
|
|
||||||
// (If we used the the method from the previous scenario, and
|
|
||||||
// kept only snapshot at the branch point, we'd need to keep
|
|
||||||
// all the WAL between 10000-18000 on the main branch, so
|
|
||||||
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
|
||||||
// calculation always picks the cheapest alternative)
|
|
||||||
|
|
||||||
let (_model, result) = scenario.calculate(1000);
|
|
||||||
|
|
||||||
assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn scenario_5() {
|
|
||||||
let mut scenario = ScenarioBuilder::new("a");
|
|
||||||
scenario.insert("a", 5000);
|
|
||||||
scenario.branch("a", "b");
|
|
||||||
scenario.update("b", 4000);
|
|
||||||
scenario.update("a", 2000);
|
|
||||||
scenario.branch("a", "c");
|
|
||||||
scenario.insert("c", 4000);
|
|
||||||
scenario.insert("a", 2000);
|
|
||||||
|
|
||||||
let (_model, result) = scenario.calculate(1000);
|
|
||||||
|
|
||||||
assert_eq!(result.total_size, 17000);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn scenario_6() {
|
|
||||||
let branches = [
|
|
||||||
"7ff1edab8182025f15ae33482edb590a",
|
|
||||||
"b1719e044db05401a05a2ed588a3ad3f",
|
|
||||||
"0xb68d6691c895ad0a70809470020929ef",
|
|
||||||
];
|
|
||||||
|
|
||||||
// compared to other scenarios, this one uses bytes instead of kB
|
|
||||||
|
|
||||||
let mut scenario = ScenarioBuilder::new("");
|
|
||||||
|
|
||||||
scenario.branch("", branches[0]); // at 0
|
|
||||||
scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
|
|
||||||
scenario.branch(branches[0], branches[1]); // at 108951064
|
|
||||||
scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
|
|
||||||
scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
|
|
||||||
scenario.branch(branches[0], branches[2]); // at 283415424
|
|
||||||
scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
|
|
||||||
scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400
|
|
||||||
|
|
||||||
let (model, result) = scenario.calculate(100_000);
|
|
||||||
|
|
||||||
// FIXME: We previously calculated 333_792_000. But with this PR, we get
|
|
||||||
// a much lower number. At a quick look at the model output and the
|
|
||||||
// calculations here, the new result seems correct to me.
|
|
||||||
eprintln!(
|
|
||||||
" MODEL: {}",
|
|
||||||
serde_json::to_string(&model.segments).unwrap()
|
|
||||||
);
|
|
||||||
eprintln!(
|
|
||||||
"RESULT: {}",
|
|
||||||
serde_json::to_string(&result.segments).unwrap()
|
|
||||||
);
|
|
||||||
|
|
||||||
assert_eq!(result.total_size, 136_236_928);
|
|
||||||
}
|
|
||||||
@@ -13,7 +13,6 @@ bincode.workspace = true
|
|||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
heapless.workspace = true
|
heapless.workspace = true
|
||||||
hyper = { workspace = true, features = ["full"] }
|
hyper = { workspace = true, features = ["full"] }
|
||||||
futures = { workspace = true}
|
|
||||||
routerify.workspace = true
|
routerify.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
@@ -40,7 +39,7 @@ pq_proto.workspace = true
|
|||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
byteorder.workspace = true
|
byteorder.workspace = true
|
||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
|
|||||||
@@ -4,13 +4,13 @@ use anyhow::{anyhow, Context};
|
|||||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||||
use hyper::http::HeaderValue;
|
use hyper::http::HeaderValue;
|
||||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||||
use hyper::{Method, StatusCode};
|
|
||||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use routerify::ext::RequestExt;
|
use routerify::ext::RequestExt;
|
||||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
|
use routerify::RequestInfo;
|
||||||
|
use routerify::{Middleware, Router, RouterBuilder, RouterService};
|
||||||
use tokio::task::JoinError;
|
use tokio::task::JoinError;
|
||||||
use tracing;
|
use tracing::info;
|
||||||
|
|
||||||
use std::future::Future;
|
use std::future::Future;
|
||||||
use std::net::TcpListener;
|
use std::net::TcpListener;
|
||||||
@@ -26,36 +26,8 @@ static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
static X_REQUEST_ID_HEADER_STR: &str = "x-request-id";
|
|
||||||
|
|
||||||
static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HEADER_STR);
|
|
||||||
#[derive(Debug, Default, Clone)]
|
|
||||||
struct RequestId(String);
|
|
||||||
|
|
||||||
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
|
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
|
||||||
let request_id = info.context::<RequestId>().unwrap_or_default().0;
|
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
|
||||||
|
|
||||||
// cannot factor out the Level to avoid the repetition
|
|
||||||
// because tracing can only work with const Level
|
|
||||||
// which is not the case here
|
|
||||||
|
|
||||||
if info.method() == Method::GET && res.status() == StatusCode::OK {
|
|
||||||
tracing::debug!(
|
|
||||||
"{} {} {} {}",
|
|
||||||
info.method(),
|
|
||||||
info.uri().path(),
|
|
||||||
request_id,
|
|
||||||
res.status()
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
tracing::info!(
|
|
||||||
"{} {} {} {}",
|
|
||||||
info.method(),
|
|
||||||
info.uri().path(),
|
|
||||||
request_id,
|
|
||||||
res.status()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Ok(res)
|
Ok(res)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -83,52 +55,9 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
|
|||||||
Ok(response)
|
Ok(response)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
|
||||||
) -> Middleware<B, ApiError> {
|
|
||||||
Middleware::pre(move |req| async move {
|
|
||||||
let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) {
|
|
||||||
Some(request_id) => request_id
|
|
||||||
.to_str()
|
|
||||||
.expect("extract request id value")
|
|
||||||
.to_owned(),
|
|
||||||
None => {
|
|
||||||
let request_id = uuid::Uuid::new_v4();
|
|
||||||
request_id.to_string()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if req.method() == Method::GET {
|
|
||||||
tracing::debug!("{} {} {}", req.method(), req.uri().path(), request_id);
|
|
||||||
} else {
|
|
||||||
tracing::info!("{} {} {}", req.method(), req.uri().path(), request_id);
|
|
||||||
}
|
|
||||||
req.set_context(RequestId(request_id));
|
|
||||||
|
|
||||||
Ok(req)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn add_request_id_header_to_response(
|
|
||||||
mut res: Response<Body>,
|
|
||||||
req_info: RequestInfo,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
if let Some(request_id) = req_info.context::<RequestId>() {
|
|
||||||
if let Ok(request_header_value) = HeaderValue::from_str(&request_id.0) {
|
|
||||||
res.headers_mut()
|
|
||||||
.insert(&X_REQUEST_ID_HEADER, request_header_value);
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(res)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||||
Router::builder()
|
Router::builder()
|
||||||
.middleware(add_request_id_middleware())
|
|
||||||
.middleware(Middleware::post_with_info(logger))
|
.middleware(Middleware::post_with_info(logger))
|
||||||
.middleware(Middleware::post_with_info(
|
|
||||||
add_request_id_header_to_response,
|
|
||||||
))
|
|
||||||
.get("/metrics", prometheus_metrics_handler)
|
.get("/metrics", prometheus_metrics_handler)
|
||||||
.err_handler(error::handler)
|
.err_handler(error::handler)
|
||||||
}
|
}
|
||||||
@@ -274,7 +203,7 @@ pub fn serve_thread_main<S>(
|
|||||||
where
|
where
|
||||||
S: Future<Output = ()> + Send + Sync,
|
S: Future<Output = ()> + Send + Sync,
|
||||||
{
|
{
|
||||||
tracing::info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||||
|
|
||||||
// Create a Service from the router above to handle incoming requests.
|
// Create a Service from the router above to handle incoming requests.
|
||||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||||
@@ -294,48 +223,3 @@ where
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
use futures::future::poll_fn;
|
|
||||||
use hyper::service::Service;
|
|
||||||
use routerify::RequestServiceBuilder;
|
|
||||||
use std::net::{IpAddr, SocketAddr};
|
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn test_request_id_returned() {
|
|
||||||
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
|
|
||||||
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
|
|
||||||
let mut service = builder.build(remote_addr);
|
|
||||||
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
|
|
||||||
panic!("request service is not ready: {:?}", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut req: Request<Body> = Request::default();
|
|
||||||
req.headers_mut()
|
|
||||||
.append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap());
|
|
||||||
|
|
||||||
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
|
|
||||||
|
|
||||||
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap();
|
|
||||||
|
|
||||||
assert!(header_val == "42", "response header mismatch");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn test_request_id_empty() {
|
|
||||||
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
|
|
||||||
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
|
|
||||||
let mut service = builder.build(remote_addr);
|
|
||||||
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
|
|
||||||
panic!("request service is not ready: {:?}", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
let req: Request<Body> = Request::default();
|
|
||||||
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
|
|
||||||
|
|
||||||
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER);
|
|
||||||
|
|
||||||
assert_ne!(header_val, None, "response header should NOT be empty");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
use std::fmt::Display;
|
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use bytes::Buf;
|
use bytes::Buf;
|
||||||
use hyper::{header, Body, Request, Response, StatusCode};
|
use hyper::{header, Body, Request, Response, StatusCode};
|
||||||
use serde::{Deserialize, Serialize, Serializer};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use super::error::ApiError;
|
use super::error::ApiError;
|
||||||
|
|
||||||
@@ -33,12 +31,3 @@ pub fn json_response<T: Serialize>(
|
|||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||||
Ok(response)
|
Ok(response)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Serialize through Display trait.
|
|
||||||
pub fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
|
|
||||||
where
|
|
||||||
S: Serializer,
|
|
||||||
F: Display,
|
|
||||||
{
|
|
||||||
s.serialize_str(&format!("{}", z))
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -45,115 +45,3 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Disable the default rust panic hook by using `set_hook`.
|
|
||||||
///
|
|
||||||
/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
|
|
||||||
/// that sentry is configured (if needed). sentry will install it's own on top of this, always
|
|
||||||
/// processing the panic before we log it.
|
|
||||||
///
|
|
||||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
|
||||||
/// If the assumptions about the initialization order are not held, use
|
|
||||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
|
||||||
/// lost.
|
|
||||||
#[must_use]
|
|
||||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
|
||||||
std::panic::set_hook(Box::new(tracing_panic_hook));
|
|
||||||
TracingPanicHookGuard::new()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Drop guard which restores the std panic hook on drop.
|
|
||||||
///
|
|
||||||
/// Tracing should not be used when it's not configured, but we cannot really latch on to any
|
|
||||||
/// imaginary lifetime of tracing.
|
|
||||||
pub struct TracingPanicHookGuard {
|
|
||||||
act: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TracingPanicHookGuard {
|
|
||||||
fn new() -> Self {
|
|
||||||
TracingPanicHookGuard { act: true }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Make this hook guard not do anything when dropped.
|
|
||||||
pub fn forget(&mut self) {
|
|
||||||
self.act = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for TracingPanicHookGuard {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
if self.act {
|
|
||||||
let _ = std::panic::take_hook();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Named symbol for our panic hook, which logs the panic.
|
|
||||||
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
|
|
||||||
// following rust 1.66.1 std implementation:
|
|
||||||
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
|
||||||
let location = info.location();
|
|
||||||
|
|
||||||
let msg = match info.payload().downcast_ref::<&'static str>() {
|
|
||||||
Some(s) => *s,
|
|
||||||
None => match info.payload().downcast_ref::<String>() {
|
|
||||||
Some(s) => &s[..],
|
|
||||||
None => "Box<dyn Any>",
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let thread = std::thread::current();
|
|
||||||
let thread = thread.name().unwrap_or("<unnamed>");
|
|
||||||
let backtrace = std::backtrace::Backtrace::capture();
|
|
||||||
|
|
||||||
let _entered = if let Some(location) = location {
|
|
||||||
tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
|
|
||||||
} else {
|
|
||||||
// very unlikely to hit here, but the guarantees of std could change
|
|
||||||
tracing::error_span!("panic", %thread)
|
|
||||||
}
|
|
||||||
.entered();
|
|
||||||
|
|
||||||
if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
|
|
||||||
// this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
|
|
||||||
// get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
|
|
||||||
// string, maybe even to a TLS one but tracing already does that.
|
|
||||||
tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
|
|
||||||
} else {
|
|
||||||
tracing::error!("{msg}");
|
|
||||||
}
|
|
||||||
|
|
||||||
// ensure that we log something on the panic if this hook is left after tracing has been
|
|
||||||
// unconfigured. worst case when teardown is racing the panic is to log the panic twice.
|
|
||||||
tracing::dispatcher::get_default(|d| {
|
|
||||||
if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
|
|
||||||
let location = location.map(PrettyLocation);
|
|
||||||
log_panic_to_stderr(thread, msg, location, &backtrace);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cold]
|
|
||||||
fn log_panic_to_stderr(
|
|
||||||
thread: &str,
|
|
||||||
msg: &str,
|
|
||||||
location: Option<PrettyLocation<'_, '_>>,
|
|
||||||
backtrace: &std::backtrace::Backtrace,
|
|
||||||
) {
|
|
||||||
eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
|
|
||||||
}
|
|
||||||
|
|
||||||
struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
|
|
||||||
|
|
||||||
impl std::fmt::Display for PrettyLocation<'_, '_> {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::fmt::Debug for PrettyLocation<'_, '_> {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
<Self as std::fmt::Display>::fmt(self, f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
4
libs/walproposer/.gitignore
vendored
4
libs/walproposer/.gitignore
vendored
@@ -1,4 +0,0 @@
|
|||||||
*.a
|
|
||||||
*.o
|
|
||||||
*.tmp
|
|
||||||
pgdata
|
|
||||||
@@ -1,39 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "walproposer"
|
|
||||||
version = "0.1.0"
|
|
||||||
edition.workspace = true
|
|
||||||
license.workspace = true
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
atty.workspace = true
|
|
||||||
rand.workspace = true
|
|
||||||
regex.workspace = true
|
|
||||||
bytes.workspace = true
|
|
||||||
byteorder.workspace = true
|
|
||||||
anyhow.workspace = true
|
|
||||||
crc32c.workspace = true
|
|
||||||
hex.workspace = true
|
|
||||||
once_cell.workspace = true
|
|
||||||
log.workspace = true
|
|
||||||
libc.workspace = true
|
|
||||||
memoffset.workspace = true
|
|
||||||
thiserror.workspace = true
|
|
||||||
tracing.workspace = true
|
|
||||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
|
||||||
serde.workspace = true
|
|
||||||
scopeguard.workspace = true
|
|
||||||
utils.workspace = true
|
|
||||||
safekeeper.workspace = true
|
|
||||||
postgres_ffi.workspace = true
|
|
||||||
hyper.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|
||||||
[dev-dependencies]
|
|
||||||
env_logger.workspace = true
|
|
||||||
postgres.workspace = true
|
|
||||||
|
|
||||||
[build-dependencies]
|
|
||||||
anyhow.workspace = true
|
|
||||||
bindgen.workspace = true
|
|
||||||
cbindgen = "0.24.0"
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
# walproposer Rust module
|
|
||||||
|
|
||||||
## Rust -> C
|
|
||||||
|
|
||||||
We compile walproposer as a static library and generate Rust bindings for it using `bindgen`.
|
|
||||||
Entrypoint header file is `bindgen_deps.h`.
|
|
||||||
|
|
||||||
## C -> Rust
|
|
||||||
|
|
||||||
We use `cbindgen` to generate C bindings for the Rust code. They are stored in `rust_bindings.h`.
|
|
||||||
|
|
||||||
## How to run the tests
|
|
||||||
|
|
||||||
```
|
|
||||||
export RUSTFLAGS="-C default-linker-libraries"
|
|
||||||
```
|
|
||||||
@@ -1,30 +0,0 @@
|
|||||||
/*
|
|
||||||
* This header file is the input to bindgen. It includes all the
|
|
||||||
* PostgreSQL headers that we need to auto-generate Rust structs
|
|
||||||
* from. If you need to expose a new struct to Rust code, add the
|
|
||||||
* header here, and whitelist the struct in the build.rs file.
|
|
||||||
*/
|
|
||||||
#include "c.h"
|
|
||||||
#include "walproposer.h"
|
|
||||||
|
|
||||||
#include <stdarg.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
// Calc a sum of two numbers. Used to test Rust->C function calls.
|
|
||||||
int TestFunc(int a, int b);
|
|
||||||
|
|
||||||
// Run a client for simple simlib test.
|
|
||||||
void RunClientC(uint32_t serverId);
|
|
||||||
|
|
||||||
void WalProposerRust();
|
|
||||||
|
|
||||||
void WalProposerCleanup();
|
|
||||||
|
|
||||||
extern bool debug_enabled;
|
|
||||||
|
|
||||||
// Initialize global variables before calling any Postgres C code.
|
|
||||||
void MyContextInit();
|
|
||||||
|
|
||||||
XLogRecPtr MyInsertRecord();
|
|
||||||
@@ -1,137 +0,0 @@
|
|||||||
use std::{env, path::PathBuf, process::Command};
|
|
||||||
use anyhow::{anyhow, Context};
|
|
||||||
use bindgen::CargoCallbacks;
|
|
||||||
|
|
||||||
extern crate bindgen;
|
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
|
||||||
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
|
||||||
|
|
||||||
cbindgen::Builder::new()
|
|
||||||
.with_crate(crate_dir)
|
|
||||||
.with_language(cbindgen::Language::C)
|
|
||||||
.generate()
|
|
||||||
.expect("Unable to generate bindings")
|
|
||||||
.write_to_file("rust_bindings.h");
|
|
||||||
|
|
||||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
|
||||||
println!("cargo:rerun-if-changed=bindgen_deps.h,test.c,../../pgxn/neon/walproposer.c,build.sh");
|
|
||||||
println!("cargo:rustc-link-arg=-Wl,--start-group");
|
|
||||||
println!("cargo:rustc-link-arg=-lsim");
|
|
||||||
println!("cargo:rustc-link-arg=-lpgport_srv");
|
|
||||||
println!("cargo:rustc-link-arg=-lpostgres");
|
|
||||||
println!("cargo:rustc-link-arg=-lpgcommon_srv");
|
|
||||||
println!("cargo:rustc-link-arg=-lssl");
|
|
||||||
println!("cargo:rustc-link-arg=-lcrypto");
|
|
||||||
println!("cargo:rustc-link-arg=-lz");
|
|
||||||
println!("cargo:rustc-link-arg=-lpthread");
|
|
||||||
println!("cargo:rustc-link-arg=-lrt");
|
|
||||||
println!("cargo:rustc-link-arg=-ldl");
|
|
||||||
println!("cargo:rustc-link-arg=-lm");
|
|
||||||
println!("cargo:rustc-link-arg=-lwalproposer");
|
|
||||||
println!("cargo:rustc-link-arg=-Wl,--end-group");
|
|
||||||
println!("cargo:rustc-link-search=/home/admin/simulator/libs/walproposer");
|
|
||||||
// disable fPIE
|
|
||||||
println!("cargo:rustc-link-arg=-no-pie");
|
|
||||||
|
|
||||||
// print output of build.sh
|
|
||||||
let output = std::process::Command::new("./build.sh")
|
|
||||||
.output()
|
|
||||||
.expect("could not spawn `clang`");
|
|
||||||
|
|
||||||
println!("stdout: {}", String::from_utf8(output.stdout).unwrap());
|
|
||||||
println!("stderr: {}", String::from_utf8(output.stderr).unwrap());
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
// Panic if the command was not successful.
|
|
||||||
panic!("could not compile object file");
|
|
||||||
}
|
|
||||||
|
|
||||||
// // Finding the location of C headers for the Postgres server:
|
|
||||||
// // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
|
|
||||||
// // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
|
|
||||||
let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
|
|
||||||
postgres_install_dir.into()
|
|
||||||
} else {
|
|
||||||
PathBuf::from("pg_install")
|
|
||||||
};
|
|
||||||
|
|
||||||
let pg_version = "v15";
|
|
||||||
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
|
|
||||||
if pg_install_dir_versioned.is_relative() {
|
|
||||||
let cwd = env::current_dir().context("Failed to get current_dir")?;
|
|
||||||
pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
|
|
||||||
}
|
|
||||||
|
|
||||||
let pg_config_bin = pg_install_dir_versioned
|
|
||||||
.join(pg_version)
|
|
||||||
.join("bin")
|
|
||||||
.join("pg_config");
|
|
||||||
let inc_server_path: String = if pg_config_bin.exists() {
|
|
||||||
let output = Command::new(pg_config_bin)
|
|
||||||
.arg("--includedir-server")
|
|
||||||
.output()
|
|
||||||
.context("failed to execute `pg_config --includedir-server`")?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
panic!("`pg_config --includedir-server` failed")
|
|
||||||
}
|
|
||||||
|
|
||||||
String::from_utf8(output.stdout)
|
|
||||||
.context("pg_config output is not UTF-8")?
|
|
||||||
.trim_end()
|
|
||||||
.into()
|
|
||||||
} else {
|
|
||||||
let server_path = pg_install_dir_versioned
|
|
||||||
.join("include")
|
|
||||||
.join("postgresql")
|
|
||||||
.join("server")
|
|
||||||
.into_os_string();
|
|
||||||
server_path
|
|
||||||
.into_string()
|
|
||||||
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
|
||||||
};
|
|
||||||
|
|
||||||
let inc_pgxn_path = "/home/admin/simulator/pgxn/neon";
|
|
||||||
|
|
||||||
// The bindgen::Builder is the main entry point
|
|
||||||
// to bindgen, and lets you build up options for
|
|
||||||
// the resulting bindings.
|
|
||||||
let bindings = bindgen::Builder::default()
|
|
||||||
// The input header we would like to generate
|
|
||||||
// bindings for.
|
|
||||||
.header("bindgen_deps.h")
|
|
||||||
// Tell cargo to invalidate the built crate whenever any of the
|
|
||||||
// included header files changed.
|
|
||||||
.parse_callbacks(Box::new(CargoCallbacks))
|
|
||||||
.allowlist_function("TestFunc")
|
|
||||||
.allowlist_function("RunClientC")
|
|
||||||
.allowlist_function("WalProposerRust")
|
|
||||||
.allowlist_function("MyContextInit")
|
|
||||||
.allowlist_function("WalProposerCleanup")
|
|
||||||
.allowlist_function("MyInsertRecord")
|
|
||||||
.allowlist_var("wal_acceptors_list")
|
|
||||||
.allowlist_var("wal_acceptor_reconnect_timeout")
|
|
||||||
.allowlist_var("wal_acceptor_connection_timeout")
|
|
||||||
.allowlist_var("am_wal_proposer")
|
|
||||||
.allowlist_var("neon_timeline_walproposer")
|
|
||||||
.allowlist_var("neon_tenant_walproposer")
|
|
||||||
.allowlist_var("syncSafekeepers")
|
|
||||||
.allowlist_var("sim_redo_start_lsn")
|
|
||||||
.allowlist_var("debug_enabled")
|
|
||||||
.clang_arg(format!("-I{inc_server_path}"))
|
|
||||||
.clang_arg(format!("-I{inc_pgxn_path}"))
|
|
||||||
.clang_arg(format!("-DSIMLIB"))
|
|
||||||
// Finish the builder and generate the bindings.
|
|
||||||
.generate()
|
|
||||||
// Unwrap the Result and panic on failure.
|
|
||||||
.expect("Unable to generate bindings");
|
|
||||||
|
|
||||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
|
||||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
|
|
||||||
bindings
|
|
||||||
.write_to_file(out_path)
|
|
||||||
.expect("Couldn't write bindings!");
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
cd /home/admin/simulator/libs/walproposer
|
|
||||||
|
|
||||||
# TODO: rewrite to Makefile
|
|
||||||
|
|
||||||
make -C ../.. neon-pg-ext-walproposer
|
|
||||||
make -C ../../pg_install/build/v15/src/backend postgres-lib -s
|
|
||||||
cp ../../pg_install/build/v15/src/backend/libpostgres.a .
|
|
||||||
cp ../../pg_install/build/v15/src/common/libpgcommon_srv.a .
|
|
||||||
cp ../../pg_install/build/v15/src/port/libpgport_srv.a .
|
|
||||||
|
|
||||||
clang -g -c libpqwalproposer.c test.c -ferror-limit=1 -I ../../pg_install/v15/include/postgresql/server -I ../../pgxn/neon
|
|
||||||
rm -rf libsim.a
|
|
||||||
ar rcs libsim.a test.o libpqwalproposer.o
|
|
||||||
|
|
||||||
rm -rf libwalproposer.a
|
|
||||||
|
|
||||||
PGXN_DIR=../../pg_install/build/neon-v15/
|
|
||||||
ar rcs libwalproposer.a $PGXN_DIR/walproposer.o $PGXN_DIR/walproposer_utils.o $PGXN_DIR/neon.o
|
|
||||||
@@ -1,542 +0,0 @@
|
|||||||
#include "postgres.h"
|
|
||||||
#include "neon.h"
|
|
||||||
#include "walproposer.h"
|
|
||||||
#include "rust_bindings.h"
|
|
||||||
#include "replication/message.h"
|
|
||||||
#include "access/xlog_internal.h"
|
|
||||||
|
|
||||||
// defined in walproposer.h
|
|
||||||
uint64 sim_redo_start_lsn;
|
|
||||||
XLogRecPtr sim_latest_available_lsn;
|
|
||||||
|
|
||||||
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
|
|
||||||
struct WalProposerConn
|
|
||||||
{
|
|
||||||
int64_t tcp;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Helper function */
|
|
||||||
static bool
|
|
||||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "not implemented");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Exported function definitions */
|
|
||||||
char *
|
|
||||||
walprop_error_message(WalProposerConn *conn)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "not implemented");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
WalProposerConnStatusType
|
|
||||||
walprop_status(WalProposerConn *conn)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "not implemented: walprop_status");
|
|
||||||
return WP_CONNECTION_OK;
|
|
||||||
}
|
|
||||||
|
|
||||||
WalProposerConn *
|
|
||||||
walprop_connect_start(char *conninfo)
|
|
||||||
{
|
|
||||||
WalProposerConn *conn;
|
|
||||||
|
|
||||||
walprop_log(LOG, "walprop_connect_start: %s", conninfo);
|
|
||||||
|
|
||||||
const char *connstr_prefix = "host=node port=";
|
|
||||||
Assert(strncmp(conninfo, connstr_prefix, strlen(connstr_prefix)) == 0);
|
|
||||||
|
|
||||||
int nodeId = atoi(conninfo + strlen(connstr_prefix));
|
|
||||||
|
|
||||||
conn = palloc(sizeof(WalProposerConn));
|
|
||||||
conn->tcp = sim_open_tcp(nodeId);
|
|
||||||
return conn;
|
|
||||||
}
|
|
||||||
|
|
||||||
WalProposerConnectPollStatusType
|
|
||||||
walprop_connect_poll(WalProposerConn *conn)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "not implemented: walprop_connect_poll");
|
|
||||||
return WP_CONN_POLLING_OK;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool
|
|
||||||
walprop_send_query(WalProposerConn *conn, char *query)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "not implemented: walprop_send_query");
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
WalProposerExecStatusType
|
|
||||||
walprop_get_query_result(WalProposerConn *conn)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "not implemented: walprop_get_query_result");
|
|
||||||
return WP_EXEC_SUCCESS_COPYBOTH;
|
|
||||||
}
|
|
||||||
|
|
||||||
pgsocket
|
|
||||||
walprop_socket(WalProposerConn *conn)
|
|
||||||
{
|
|
||||||
return (pgsocket) conn->tcp;
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
walprop_flush(WalProposerConn *conn)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "not implemented");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
walprop_finish(WalProposerConn *conn)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "walprop_finish not implemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Receive a message from the safekeeper.
|
|
||||||
*
|
|
||||||
* On success, the data is placed in *buf. It is valid until the next call
|
|
||||||
* to this function.
|
|
||||||
*/
|
|
||||||
PGAsyncReadResult
|
|
||||||
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
|
||||||
{
|
|
||||||
uintptr_t len;
|
|
||||||
char *msg;
|
|
||||||
Event event;
|
|
||||||
|
|
||||||
event = sim_epoll_peek(0);
|
|
||||||
if (event.tcp != conn->tcp || event.tag != Message || event.any_message != Bytes)
|
|
||||||
return PG_ASYNC_READ_TRY_AGAIN;
|
|
||||||
|
|
||||||
event = sim_epoll_rcv(0);
|
|
||||||
|
|
||||||
// walprop_log(LOG, "walprop_async_read, T: %d, tcp: %d, tag: %d", (int) event.tag, (int) event.tcp, (int) event.any_message);
|
|
||||||
Assert(event.tcp == conn->tcp);
|
|
||||||
Assert(event.tag == Message);
|
|
||||||
Assert(event.any_message == Bytes);
|
|
||||||
|
|
||||||
msg = (char*) sim_msg_get_bytes(&len);
|
|
||||||
*buf = msg;
|
|
||||||
*amount = len;
|
|
||||||
// walprop_log(LOG, "walprop_async_read: %d", (int) len);
|
|
||||||
|
|
||||||
return PG_ASYNC_READ_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
PGAsyncWriteResult
|
|
||||||
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "walprop_async_write");
|
|
||||||
sim_msg_set_bytes(buf, size);
|
|
||||||
sim_tcp_send(conn->tcp);
|
|
||||||
return PG_ASYNC_WRITE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This function is very similar to walprop_async_write. For more
|
|
||||||
* information, refer to the comments there.
|
|
||||||
*/
|
|
||||||
bool
|
|
||||||
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
|
|
||||||
{
|
|
||||||
// walprop_log(LOG, "walprop_blocking_write");
|
|
||||||
sim_msg_set_bytes(buf, size);
|
|
||||||
sim_tcp_send(conn->tcp);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
sim_start_replication(XLogRecPtr startptr)
|
|
||||||
{
|
|
||||||
walprop_log(LOG, "sim_start_replication: %X/%X", LSN_FORMAT_ARGS(startptr));
|
|
||||||
sim_latest_available_lsn = startptr;
|
|
||||||
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
XLogRecPtr endptr = sim_latest_available_lsn;
|
|
||||||
|
|
||||||
Assert(startptr <= endptr);
|
|
||||||
if (endptr > startptr)
|
|
||||||
{
|
|
||||||
WalProposerBroadcast(startptr, endptr);
|
|
||||||
startptr = endptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
WalProposerPoll();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
|
|
||||||
|
|
||||||
static int UsableBytesInSegment =
|
|
||||||
(DEFAULT_XLOG_SEG_SIZE / XLOG_BLCKSZ * UsableBytesInPage) -
|
|
||||||
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Converts a "usable byte position" to XLogRecPtr. A usable byte position
|
|
||||||
* is the position starting from the beginning of WAL, excluding all WAL
|
|
||||||
* page headers.
|
|
||||||
*/
|
|
||||||
static XLogRecPtr
|
|
||||||
XLogBytePosToRecPtr(uint64 bytepos)
|
|
||||||
{
|
|
||||||
uint64 fullsegs;
|
|
||||||
uint64 fullpages;
|
|
||||||
uint64 bytesleft;
|
|
||||||
uint32 seg_offset;
|
|
||||||
XLogRecPtr result;
|
|
||||||
|
|
||||||
fullsegs = bytepos / UsableBytesInSegment;
|
|
||||||
bytesleft = bytepos % UsableBytesInSegment;
|
|
||||||
|
|
||||||
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
|
|
||||||
{
|
|
||||||
/* fits on first page of segment */
|
|
||||||
seg_offset = bytesleft + SizeOfXLogLongPHD;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* account for the first page on segment with long header */
|
|
||||||
seg_offset = XLOG_BLCKSZ;
|
|
||||||
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
|
|
||||||
|
|
||||||
fullpages = bytesleft / UsableBytesInPage;
|
|
||||||
bytesleft = bytesleft % UsableBytesInPage;
|
|
||||||
|
|
||||||
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
|
|
||||||
}
|
|
||||||
|
|
||||||
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Convert an XLogRecPtr to a "usable byte position".
|
|
||||||
*/
|
|
||||||
static uint64
|
|
||||||
XLogRecPtrToBytePos(XLogRecPtr ptr)
|
|
||||||
{
|
|
||||||
uint64 fullsegs;
|
|
||||||
uint32 fullpages;
|
|
||||||
uint32 offset;
|
|
||||||
uint64 result;
|
|
||||||
|
|
||||||
XLByteToSeg(ptr, fullsegs, wal_segment_size);
|
|
||||||
|
|
||||||
fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
|
|
||||||
offset = ptr % XLOG_BLCKSZ;
|
|
||||||
|
|
||||||
if (fullpages == 0)
|
|
||||||
{
|
|
||||||
result = fullsegs * UsableBytesInSegment;
|
|
||||||
if (offset > 0)
|
|
||||||
{
|
|
||||||
Assert(offset >= SizeOfXLogLongPHD);
|
|
||||||
result += offset - SizeOfXLogLongPHD;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
result = fullsegs * UsableBytesInSegment +
|
|
||||||
(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
|
|
||||||
(fullpages - 1) * UsableBytesInPage; /* full pages */
|
|
||||||
if (offset > 0)
|
|
||||||
{
|
|
||||||
Assert(offset >= SizeOfXLogShortPHD);
|
|
||||||
result += offset - SizeOfXLogShortPHD;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define max_rdatas 16
|
|
||||||
|
|
||||||
void InitMyInsert();
|
|
||||||
static void MyBeginInsert();
|
|
||||||
static void MyRegisterData(char *data, int len);
|
|
||||||
static XLogRecPtr MyFinishInsert(RmgrId rmid, uint8 info, uint8 flags);
|
|
||||||
static void MyCopyXLogRecordToWAL(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* An array of XLogRecData structs, to hold registered data.
|
|
||||||
*/
|
|
||||||
static XLogRecData rdatas[max_rdatas];
|
|
||||||
static int num_rdatas; /* entries currently used */
|
|
||||||
static uint32 mainrdata_len; /* total # of bytes in chain */
|
|
||||||
static XLogRecData hdr_rdt;
|
|
||||||
static char hdr_scratch[16000];
|
|
||||||
static XLogRecPtr CurrBytePos;
|
|
||||||
static XLogRecPtr PrevBytePos;
|
|
||||||
|
|
||||||
void InitMyInsert()
|
|
||||||
{
|
|
||||||
CurrBytePos = sim_redo_start_lsn;
|
|
||||||
PrevBytePos = InvalidXLogRecPtr;
|
|
||||||
sim_latest_available_lsn = sim_redo_start_lsn;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void MyBeginInsert()
|
|
||||||
{
|
|
||||||
num_rdatas = 0;
|
|
||||||
mainrdata_len = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void MyRegisterData(char *data, int len)
|
|
||||||
{
|
|
||||||
XLogRecData *rdata;
|
|
||||||
|
|
||||||
if (num_rdatas >= max_rdatas)
|
|
||||||
walprop_log(ERROR, "too much WAL data");
|
|
||||||
rdata = &rdatas[num_rdatas++];
|
|
||||||
|
|
||||||
rdata->data = data;
|
|
||||||
rdata->len = len;
|
|
||||||
rdata->next = NULL;
|
|
||||||
|
|
||||||
if (num_rdatas > 1) {
|
|
||||||
rdatas[num_rdatas - 2].next = rdata;
|
|
||||||
}
|
|
||||||
|
|
||||||
mainrdata_len += len;
|
|
||||||
}
|
|
||||||
|
|
||||||
static XLogRecPtr
|
|
||||||
MyFinishInsert(RmgrId rmid, uint8 info, uint8 flags)
|
|
||||||
{
|
|
||||||
XLogRecData *rdt;
|
|
||||||
uint32 total_len = 0;
|
|
||||||
int block_id;
|
|
||||||
pg_crc32c rdata_crc;
|
|
||||||
XLogRecord *rechdr;
|
|
||||||
char *scratch = hdr_scratch;
|
|
||||||
int size;
|
|
||||||
XLogRecPtr StartPos;
|
|
||||||
XLogRecPtr EndPos;
|
|
||||||
uint64 startbytepos;
|
|
||||||
uint64 endbytepos;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Note: this function can be called multiple times for the same record.
|
|
||||||
* All the modifications we do to the rdata chains below must handle that.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* The record begins with the fixed-size header */
|
|
||||||
rechdr = (XLogRecord *) scratch;
|
|
||||||
scratch += SizeOfXLogRecord;
|
|
||||||
|
|
||||||
hdr_rdt.data = hdr_scratch;
|
|
||||||
|
|
||||||
if (num_rdatas > 0)
|
|
||||||
{
|
|
||||||
hdr_rdt.next = &rdatas[0];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
hdr_rdt.next = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* followed by main data, if any */
|
|
||||||
if (mainrdata_len > 0)
|
|
||||||
{
|
|
||||||
if (mainrdata_len > 255)
|
|
||||||
{
|
|
||||||
*(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG;
|
|
||||||
memcpy(scratch, &mainrdata_len, sizeof(uint32));
|
|
||||||
scratch += sizeof(uint32);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
*(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT;
|
|
||||||
*(scratch++) = (uint8) mainrdata_len;
|
|
||||||
}
|
|
||||||
total_len += mainrdata_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
hdr_rdt.len = (scratch - hdr_scratch);
|
|
||||||
total_len += hdr_rdt.len;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Calculate CRC of the data
|
|
||||||
*
|
|
||||||
* Note that the record header isn't added into the CRC initially since we
|
|
||||||
* don't know the prev-link yet. Thus, the CRC will represent the CRC of
|
|
||||||
* the whole record in the order: rdata, then backup blocks, then record
|
|
||||||
* header.
|
|
||||||
*/
|
|
||||||
INIT_CRC32C(rdata_crc);
|
|
||||||
COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord);
|
|
||||||
for (size_t i = 0; i < num_rdatas; i++)
|
|
||||||
{
|
|
||||||
rdt = &rdatas[i];
|
|
||||||
COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fill in the fields in the record header. Prev-link is filled in later,
|
|
||||||
* once we know where in the WAL the record will be inserted. The CRC does
|
|
||||||
* not include the record header yet.
|
|
||||||
*/
|
|
||||||
rechdr->xl_xid = 0;
|
|
||||||
rechdr->xl_tot_len = total_len;
|
|
||||||
rechdr->xl_info = info;
|
|
||||||
rechdr->xl_rmid = rmid;
|
|
||||||
rechdr->xl_prev = InvalidXLogRecPtr;
|
|
||||||
rechdr->xl_crc = rdata_crc;
|
|
||||||
|
|
||||||
size = MAXALIGN(rechdr->xl_tot_len);
|
|
||||||
|
|
||||||
/* All (non xlog-switch) records should contain data. */
|
|
||||||
Assert(size > SizeOfXLogRecord);
|
|
||||||
|
|
||||||
startbytepos = XLogRecPtrToBytePos(CurrBytePos);
|
|
||||||
endbytepos = startbytepos + size;
|
|
||||||
|
|
||||||
// Get the position.
|
|
||||||
StartPos = XLogBytePosToRecPtr(startbytepos);
|
|
||||||
EndPos = XLogBytePosToRecPtr(startbytepos + size);
|
|
||||||
rechdr->xl_prev = PrevBytePos;
|
|
||||||
|
|
||||||
Assert(XLogRecPtrToBytePos(StartPos) == startbytepos);
|
|
||||||
Assert(XLogRecPtrToBytePos(EndPos) == endbytepos);
|
|
||||||
|
|
||||||
// Update global pointers.
|
|
||||||
CurrBytePos = EndPos;
|
|
||||||
PrevBytePos = StartPos;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Now that xl_prev has been filled in, calculate CRC of the record
|
|
||||||
* header.
|
|
||||||
*/
|
|
||||||
rdata_crc = rechdr->xl_crc;
|
|
||||||
COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
|
|
||||||
FIN_CRC32C(rdata_crc);
|
|
||||||
rechdr->xl_crc = rdata_crc;
|
|
||||||
|
|
||||||
// Now write it to disk.
|
|
||||||
MyCopyXLogRecordToWAL(rechdr->xl_tot_len, &hdr_rdt, StartPos, EndPos);
|
|
||||||
return EndPos;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define INSERT_FREESPACE(endptr) \
|
|
||||||
(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
|
|
||||||
|
|
||||||
static void
|
|
||||||
MyCopyXLogRecordToWAL(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos)
|
|
||||||
{
|
|
||||||
XLogRecPtr CurrPos;
|
|
||||||
int written;
|
|
||||||
int freespace;
|
|
||||||
|
|
||||||
// Write hdr_rdt and `num_rdatas` other datas.
|
|
||||||
CurrPos = StartPos;
|
|
||||||
freespace = INSERT_FREESPACE(CurrPos);
|
|
||||||
written = 0;
|
|
||||||
|
|
||||||
Assert(freespace >= sizeof(uint32));
|
|
||||||
|
|
||||||
while (rdata != NULL)
|
|
||||||
{
|
|
||||||
char *rdata_data = rdata->data;
|
|
||||||
int rdata_len = rdata->len;
|
|
||||||
|
|
||||||
while (rdata_len >= freespace)
|
|
||||||
{
|
|
||||||
char header_buf[SizeOfXLogLongPHD];
|
|
||||||
XLogPageHeader NewPage = (XLogPageHeader) header_buf;
|
|
||||||
|
|
||||||
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
|
|
||||||
XLogWalPropWrite(rdata_data, freespace, CurrPos);
|
|
||||||
rdata_data += freespace;
|
|
||||||
rdata_len -= freespace;
|
|
||||||
written += freespace;
|
|
||||||
CurrPos += freespace;
|
|
||||||
|
|
||||||
// Init new page
|
|
||||||
MemSet(header_buf, 0, SizeOfXLogLongPHD);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fill the new page's header
|
|
||||||
*/
|
|
||||||
NewPage->xlp_magic = XLOG_PAGE_MAGIC;
|
|
||||||
|
|
||||||
/* NewPage->xlp_info = 0; */ /* done by memset */
|
|
||||||
NewPage->xlp_tli = 1;
|
|
||||||
NewPage->xlp_pageaddr = CurrPos;
|
|
||||||
|
|
||||||
/* NewPage->xlp_rem_len = 0; */ /* done by memset */
|
|
||||||
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If first page of an XLOG segment file, make it a long header.
|
|
||||||
*/
|
|
||||||
if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
|
|
||||||
{
|
|
||||||
XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
|
|
||||||
|
|
||||||
NewLongPage->xlp_sysid = 0;
|
|
||||||
NewLongPage->xlp_seg_size = wal_segment_size;
|
|
||||||
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
|
|
||||||
NewPage->xlp_info |= XLP_LONG_HEADER;
|
|
||||||
}
|
|
||||||
|
|
||||||
NewPage->xlp_rem_len = write_len - written;
|
|
||||||
if (NewPage->xlp_rem_len > 0) {
|
|
||||||
NewPage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* skip over the page header */
|
|
||||||
if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
|
|
||||||
{
|
|
||||||
XLogWalPropWrite(header_buf, SizeOfXLogLongPHD, CurrPos);
|
|
||||||
CurrPos += SizeOfXLogLongPHD;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
XLogWalPropWrite(header_buf, SizeOfXLogShortPHD, CurrPos);
|
|
||||||
CurrPos += SizeOfXLogShortPHD;
|
|
||||||
}
|
|
||||||
freespace = INSERT_FREESPACE(CurrPos);
|
|
||||||
}
|
|
||||||
|
|
||||||
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
|
|
||||||
XLogWalPropWrite(rdata_data, rdata_len, CurrPos);
|
|
||||||
CurrPos += rdata_len;
|
|
||||||
written += rdata_len;
|
|
||||||
freespace -= rdata_len;
|
|
||||||
|
|
||||||
rdata = rdata->next;
|
|
||||||
}
|
|
||||||
|
|
||||||
Assert(written == write_len);
|
|
||||||
CurrPos = MAXALIGN64(CurrPos);
|
|
||||||
Assert(CurrPos == EndPos);
|
|
||||||
}
|
|
||||||
|
|
||||||
XLogRecPtr MyInsertRecord()
|
|
||||||
{
|
|
||||||
const char *prefix = "prefix";
|
|
||||||
const char *message = "message";
|
|
||||||
size_t size = 7;
|
|
||||||
bool transactional = false;
|
|
||||||
|
|
||||||
xl_logical_message xlrec;
|
|
||||||
|
|
||||||
xlrec.dbId = 0;
|
|
||||||
xlrec.transactional = transactional;
|
|
||||||
/* trailing zero is critical; see logicalmsg_desc */
|
|
||||||
xlrec.prefix_size = strlen(prefix) + 1;
|
|
||||||
xlrec.message_size = size;
|
|
||||||
|
|
||||||
MyBeginInsert();
|
|
||||||
MyRegisterData((char *) &xlrec, SizeOfLogicalMessage);
|
|
||||||
MyRegisterData(unconstify(char *, prefix), xlrec.prefix_size);
|
|
||||||
MyRegisterData(unconstify(char *, message), size);
|
|
||||||
|
|
||||||
return MyFinishInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLOG_INCLUDE_ORIGIN);
|
|
||||||
}
|
|
||||||
@@ -1,106 +0,0 @@
|
|||||||
#include <stdarg.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
/**
|
|
||||||
* List of all possible AnyMessage.
|
|
||||||
*/
|
|
||||||
enum AnyMessageTag {
|
|
||||||
None,
|
|
||||||
InternalConnect,
|
|
||||||
Just32,
|
|
||||||
ReplCell,
|
|
||||||
Bytes,
|
|
||||||
LSN,
|
|
||||||
};
|
|
||||||
typedef uint8_t AnyMessageTag;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* List of all possible NodeEvent.
|
|
||||||
*/
|
|
||||||
enum EventTag {
|
|
||||||
Timeout,
|
|
||||||
Accept,
|
|
||||||
Closed,
|
|
||||||
Message,
|
|
||||||
Internal,
|
|
||||||
};
|
|
||||||
typedef uint8_t EventTag;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Event returned by epoll_recv.
|
|
||||||
*/
|
|
||||||
typedef struct Event {
|
|
||||||
EventTag tag;
|
|
||||||
int64_t tcp;
|
|
||||||
AnyMessageTag any_message;
|
|
||||||
} Event;
|
|
||||||
|
|
||||||
void rust_function(uint32_t a);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* C API for the node os.
|
|
||||||
*/
|
|
||||||
void sim_sleep(uint64_t ms);
|
|
||||||
|
|
||||||
uint64_t sim_random(uint64_t max);
|
|
||||||
|
|
||||||
uint32_t sim_id(void);
|
|
||||||
|
|
||||||
int64_t sim_open_tcp(uint32_t dst);
|
|
||||||
|
|
||||||
int64_t sim_open_tcp_nopoll(uint32_t dst);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Send MESSAGE_BUF content to the given tcp.
|
|
||||||
*/
|
|
||||||
void sim_tcp_send(int64_t tcp);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Receive a message from the given tcp. Can be used only with tcp opened with
|
|
||||||
* `sim_open_tcp_nopoll`.
|
|
||||||
*/
|
|
||||||
struct Event sim_tcp_recv(int64_t tcp);
|
|
||||||
|
|
||||||
struct Event sim_epoll_rcv(int64_t timeout);
|
|
||||||
|
|
||||||
struct Event sim_epoll_peek(int64_t timeout);
|
|
||||||
|
|
||||||
int64_t sim_now(void);
|
|
||||||
|
|
||||||
void sim_exit(int32_t code, const uint8_t *msg);
|
|
||||||
|
|
||||||
void sim_set_result(int32_t code, const uint8_t *msg);
|
|
||||||
|
|
||||||
void sim_log_event(const int8_t *msg);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get tag of the current message.
|
|
||||||
*/
|
|
||||||
AnyMessageTag sim_msg_tag(void);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read AnyMessage::Just32 message.
|
|
||||||
*/
|
|
||||||
void sim_msg_get_just_u32(uint32_t *val);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read AnyMessage::LSN message.
|
|
||||||
*/
|
|
||||||
void sim_msg_get_lsn(uint64_t *val);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Write AnyMessage::ReplCell message.
|
|
||||||
*/
|
|
||||||
void sim_msg_set_repl_cell(uint32_t value, uint32_t client_id, uint32_t seqno);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Write AnyMessage::Bytes message.
|
|
||||||
*/
|
|
||||||
void sim_msg_set_bytes(const char *bytes, uintptr_t len);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read AnyMessage::Bytes message.
|
|
||||||
*/
|
|
||||||
const char *sim_msg_get_bytes(uintptr_t *len);
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
#![allow(non_upper_case_globals)]
|
|
||||||
#![allow(non_camel_case_types)]
|
|
||||||
#![allow(non_snake_case)]
|
|
||||||
|
|
||||||
use safekeeper::simlib::node_os::NodeOs;
|
|
||||||
use tracing::info;
|
|
||||||
|
|
||||||
pub mod bindings {
|
|
||||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn rust_function(a: u32) {
|
|
||||||
info!("Hello from Rust!");
|
|
||||||
info!("a: {}", a);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub mod sim;
|
|
||||||
pub mod sim_proto;
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod test;
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
pub mod simtest;
|
|
||||||
|
|
||||||
pub fn c_context() -> Option<Box<dyn Fn(NodeOs) + Send + Sync>> {
|
|
||||||
Some(Box::new(|os: NodeOs| {
|
|
||||||
sim::c_attach_node_os(os);
|
|
||||||
unsafe { bindings::MyContextInit(); }
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn enable_debug() {
|
|
||||||
unsafe { bindings::debug_enabled = true; }
|
|
||||||
}
|
|
||||||
@@ -1,240 +0,0 @@
|
|||||||
use log::debug;
|
|
||||||
use safekeeper::simlib::{network::TCP, node_os::NodeOs, world::NodeEvent};
|
|
||||||
use std::{
|
|
||||||
cell::RefCell,
|
|
||||||
collections::HashMap,
|
|
||||||
ffi::{CStr, CString},
|
|
||||||
};
|
|
||||||
use tracing::trace;
|
|
||||||
|
|
||||||
use crate::sim_proto::{anymessage_tag, AnyMessageTag, Event, EventTag, MESSAGE_BUF};
|
|
||||||
|
|
||||||
thread_local! {
|
|
||||||
static CURRENT_NODE_OS: RefCell<Option<NodeOs>> = RefCell::new(None);
|
|
||||||
static TCP_CACHE: RefCell<HashMap<i64, TCP>> = RefCell::new(HashMap::new());
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the current node os.
|
|
||||||
fn os() -> NodeOs {
|
|
||||||
CURRENT_NODE_OS.with(|cell| cell.borrow().clone().expect("no node os set"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn tcp_save(tcp: TCP) -> i64 {
|
|
||||||
TCP_CACHE.with(|cell| {
|
|
||||||
let mut cache = cell.borrow_mut();
|
|
||||||
let id = tcp.id();
|
|
||||||
cache.insert(id, tcp);
|
|
||||||
id
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn tcp_load(id: i64) -> TCP {
|
|
||||||
TCP_CACHE.with(|cell| {
|
|
||||||
let cache = cell.borrow();
|
|
||||||
cache.get(&id).expect("unknown TCP id").clone()
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Should be called before calling any of the C functions.
|
|
||||||
pub(crate) fn c_attach_node_os(os: NodeOs) {
|
|
||||||
CURRENT_NODE_OS.with(|cell| {
|
|
||||||
*cell.borrow_mut() = Some(os);
|
|
||||||
});
|
|
||||||
TCP_CACHE.with(|cell| {
|
|
||||||
*cell.borrow_mut() = HashMap::new();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/// C API for the node os.
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_sleep(ms: u64) {
|
|
||||||
os().sleep(ms);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_random(max: u64) -> u64 {
|
|
||||||
os().random(max)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_id() -> u32 {
|
|
||||||
os().id().into()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_open_tcp(dst: u32) -> i64 {
|
|
||||||
tcp_save(os().open_tcp(dst.into()))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_open_tcp_nopoll(dst: u32) -> i64 {
|
|
||||||
tcp_save(os().open_tcp_nopoll(dst.into()))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Send MESSAGE_BUF content to the given tcp.
|
|
||||||
pub extern "C" fn sim_tcp_send(tcp: i64) {
|
|
||||||
tcp_load(tcp).send(MESSAGE_BUF.with(|cell| cell.borrow().clone()));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Receive a message from the given tcp. Can be used only with tcp opened with
|
|
||||||
/// `sim_open_tcp_nopoll`.
|
|
||||||
pub extern "C" fn sim_tcp_recv(tcp: i64) -> Event {
|
|
||||||
let event = tcp_load(tcp).recv();
|
|
||||||
match event {
|
|
||||||
NodeEvent::Accept(_) => unreachable!(),
|
|
||||||
NodeEvent::Closed(_) => Event {
|
|
||||||
tag: EventTag::Closed,
|
|
||||||
tcp: 0,
|
|
||||||
any_message: AnyMessageTag::None,
|
|
||||||
},
|
|
||||||
NodeEvent::Internal(_) => unreachable!(),
|
|
||||||
NodeEvent::Message((message, _)) => {
|
|
||||||
// store message in thread local storage, C code should use
|
|
||||||
// sim_msg_* functions to access it.
|
|
||||||
MESSAGE_BUF.with(|cell| {
|
|
||||||
*cell.borrow_mut() = message.clone();
|
|
||||||
});
|
|
||||||
Event {
|
|
||||||
tag: EventTag::Message,
|
|
||||||
tcp: 0,
|
|
||||||
any_message: anymessage_tag(&message),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NodeEvent::WakeTimeout(_) => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_epoll_rcv(timeout: i64) -> Event {
|
|
||||||
let event = os().epoll_recv(timeout);
|
|
||||||
let event = if let Some(event) = event {
|
|
||||||
event
|
|
||||||
} else {
|
|
||||||
return Event {
|
|
||||||
tag: EventTag::Timeout,
|
|
||||||
tcp: 0,
|
|
||||||
any_message: AnyMessageTag::None,
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
match event {
|
|
||||||
NodeEvent::Accept(tcp) => Event {
|
|
||||||
tag: EventTag::Accept,
|
|
||||||
tcp: tcp_save(tcp),
|
|
||||||
any_message: AnyMessageTag::None,
|
|
||||||
},
|
|
||||||
NodeEvent::Closed(tcp) => Event {
|
|
||||||
tag: EventTag::Closed,
|
|
||||||
tcp: tcp_save(tcp),
|
|
||||||
any_message: AnyMessageTag::None,
|
|
||||||
},
|
|
||||||
NodeEvent::Message((message, tcp)) => {
|
|
||||||
// store message in thread local storage, C code should use
|
|
||||||
// sim_msg_* functions to access it.
|
|
||||||
MESSAGE_BUF.with(|cell| {
|
|
||||||
*cell.borrow_mut() = message.clone();
|
|
||||||
});
|
|
||||||
Event {
|
|
||||||
tag: EventTag::Message,
|
|
||||||
tcp: tcp_save(tcp),
|
|
||||||
any_message: anymessage_tag(&message),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NodeEvent::Internal(message) => {
|
|
||||||
// store message in thread local storage, C code should use
|
|
||||||
// sim_msg_* functions to access it.
|
|
||||||
MESSAGE_BUF.with(|cell| {
|
|
||||||
*cell.borrow_mut() = message.clone();
|
|
||||||
});
|
|
||||||
Event {
|
|
||||||
tag: EventTag::Internal,
|
|
||||||
tcp: 0,
|
|
||||||
any_message: anymessage_tag(&message),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NodeEvent::WakeTimeout(_) => {
|
|
||||||
// can't happen
|
|
||||||
unreachable!()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_epoll_peek(timeout: i64) -> Event {
|
|
||||||
let event = os().epoll_peek(timeout);
|
|
||||||
let event = if let Some(event) = event {
|
|
||||||
event
|
|
||||||
} else {
|
|
||||||
return Event {
|
|
||||||
tag: EventTag::Timeout,
|
|
||||||
tcp: 0,
|
|
||||||
any_message: AnyMessageTag::None,
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
match event {
|
|
||||||
NodeEvent::Accept(tcp) => Event {
|
|
||||||
tag: EventTag::Accept,
|
|
||||||
tcp: tcp_save(tcp),
|
|
||||||
any_message: AnyMessageTag::None,
|
|
||||||
},
|
|
||||||
NodeEvent::Closed(tcp) => Event {
|
|
||||||
tag: EventTag::Closed,
|
|
||||||
tcp: tcp_save(tcp),
|
|
||||||
any_message: AnyMessageTag::None,
|
|
||||||
},
|
|
||||||
NodeEvent::Message((message, tcp)) => Event {
|
|
||||||
tag: EventTag::Message,
|
|
||||||
tcp: tcp_save(tcp),
|
|
||||||
any_message: anymessage_tag(&message),
|
|
||||||
},
|
|
||||||
NodeEvent::Internal(message) => Event {
|
|
||||||
tag: EventTag::Internal,
|
|
||||||
tcp: 0,
|
|
||||||
any_message: anymessage_tag(&message),
|
|
||||||
},
|
|
||||||
NodeEvent::WakeTimeout(_) => {
|
|
||||||
// can't happen
|
|
||||||
unreachable!()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_now() -> i64 {
|
|
||||||
os().now() as i64
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_exit(code: i32, msg: *const u8) {
|
|
||||||
trace!("sim_exit({}, {:?})", code, msg);
|
|
||||||
sim_set_result(code, msg);
|
|
||||||
|
|
||||||
// I tried to make use of pthread_exit, but it doesn't work.
|
|
||||||
// https://github.com/rust-lang/unsafe-code-guidelines/issues/211
|
|
||||||
// unsafe { libc::pthread_exit(std::ptr::null_mut()) };
|
|
||||||
|
|
||||||
// https://doc.rust-lang.org/nomicon/unwinding.html
|
|
||||||
// Everyone on the internet saying this is UB, but it works for me,
|
|
||||||
// so I'm going to use it for now.
|
|
||||||
panic!("sim_exit() called from C code")
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_set_result(code: i32, msg: *const u8) {
|
|
||||||
let msg = unsafe { CStr::from_ptr(msg as *const i8) };
|
|
||||||
let msg = msg.to_string_lossy().into_owned();
|
|
||||||
debug!("sim_set_result({}, {:?})", code, msg);
|
|
||||||
os().set_result(code, msg);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
pub extern "C" fn sim_log_event(msg: *const i8) {
|
|
||||||
let msg = unsafe { CStr::from_ptr(msg) };
|
|
||||||
let msg = msg.to_string_lossy().into_owned();
|
|
||||||
debug!("sim_log_event({:?})", msg);
|
|
||||||
os().log_event(msg);
|
|
||||||
}
|
|
||||||
@@ -1,114 +0,0 @@
|
|||||||
use safekeeper::simlib::proto::{AnyMessage, ReplCell};
|
|
||||||
use std::{cell::RefCell, ffi::c_char};
|
|
||||||
|
|
||||||
pub(crate) fn anymessage_tag(msg: &AnyMessage) -> AnyMessageTag {
|
|
||||||
match msg {
|
|
||||||
AnyMessage::None => AnyMessageTag::None,
|
|
||||||
AnyMessage::InternalConnect => AnyMessageTag::InternalConnect,
|
|
||||||
AnyMessage::Just32(_) => AnyMessageTag::Just32,
|
|
||||||
AnyMessage::ReplCell(_) => AnyMessageTag::ReplCell,
|
|
||||||
AnyMessage::Bytes(_) => AnyMessageTag::Bytes,
|
|
||||||
AnyMessage::LSN(_) => AnyMessageTag::LSN,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
thread_local! {
|
|
||||||
pub static MESSAGE_BUF: RefCell<AnyMessage> = RefCell::new(AnyMessage::None);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Get tag of the current message.
|
|
||||||
pub extern "C" fn sim_msg_tag() -> AnyMessageTag {
|
|
||||||
MESSAGE_BUF.with(|cell| anymessage_tag(&*cell.borrow()))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Read AnyMessage::Just32 message.
|
|
||||||
pub extern "C" fn sim_msg_get_just_u32(val: &mut u32) {
|
|
||||||
MESSAGE_BUF.with(|cell| match &*cell.borrow() {
|
|
||||||
AnyMessage::Just32(v) => {
|
|
||||||
*val = *v;
|
|
||||||
}
|
|
||||||
_ => panic!("expected Just32 message"),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Read AnyMessage::LSN message.
|
|
||||||
pub extern "C" fn sim_msg_get_lsn(val: &mut u64) {
|
|
||||||
MESSAGE_BUF.with(|cell| match &*cell.borrow() {
|
|
||||||
AnyMessage::LSN(v) => {
|
|
||||||
*val = *v;
|
|
||||||
}
|
|
||||||
_ => panic!("expected LSN message"),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Write AnyMessage::ReplCell message.
|
|
||||||
pub extern "C" fn sim_msg_set_repl_cell(value: u32, client_id: u32, seqno: u32) {
|
|
||||||
MESSAGE_BUF.with(|cell| {
|
|
||||||
*cell.borrow_mut() = AnyMessage::ReplCell(ReplCell {
|
|
||||||
value,
|
|
||||||
client_id,
|
|
||||||
seqno,
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Write AnyMessage::Bytes message.
|
|
||||||
pub extern "C" fn sim_msg_set_bytes(bytes: *const c_char, len: usize) {
|
|
||||||
MESSAGE_BUF.with(|cell| {
|
|
||||||
// copy bytes to a Rust Vec
|
|
||||||
let mut v: Vec<u8> = Vec::with_capacity(len);
|
|
||||||
unsafe {
|
|
||||||
v.set_len(len);
|
|
||||||
std::ptr::copy_nonoverlapping(bytes as *const u8, v.as_mut_ptr(), len);
|
|
||||||
}
|
|
||||||
*cell.borrow_mut() = AnyMessage::Bytes(v.into());
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[no_mangle]
|
|
||||||
/// Read AnyMessage::Bytes message.
|
|
||||||
pub extern "C" fn sim_msg_get_bytes(len: *mut usize) -> *const c_char {
|
|
||||||
MESSAGE_BUF.with(|cell| match &*cell.borrow() {
|
|
||||||
AnyMessage::Bytes(v) => {
|
|
||||||
unsafe {
|
|
||||||
*len = v.len();
|
|
||||||
v.as_ptr() as *const i8
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => panic!("expected Bytes message"),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
#[repr(C)]
|
|
||||||
/// Event returned by epoll_recv.
|
|
||||||
pub struct Event {
|
|
||||||
pub tag: EventTag,
|
|
||||||
pub tcp: i64,
|
|
||||||
pub any_message: AnyMessageTag,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[repr(u8)]
|
|
||||||
/// List of all possible NodeEvent.
|
|
||||||
pub enum EventTag {
|
|
||||||
Timeout,
|
|
||||||
Accept,
|
|
||||||
Closed,
|
|
||||||
Message,
|
|
||||||
Internal,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[repr(u8)]
|
|
||||||
/// List of all possible AnyMessage.
|
|
||||||
pub enum AnyMessageTag {
|
|
||||||
None,
|
|
||||||
InternalConnect,
|
|
||||||
Just32,
|
|
||||||
ReplCell,
|
|
||||||
Bytes,
|
|
||||||
LSN,
|
|
||||||
}
|
|
||||||
@@ -1,88 +0,0 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use safekeeper::safekeeper::SafeKeeperState;
|
|
||||||
use safekeeper::simlib::sync::Mutex;
|
|
||||||
use utils::id::TenantTimelineId;
|
|
||||||
|
|
||||||
pub struct Disk {
|
|
||||||
pub timelines: Mutex<HashMap<TenantTimelineId, Arc<TimelineDisk>>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Disk {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Disk {
|
|
||||||
timelines: Mutex::new(HashMap::new()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn put_state(&self, ttid: &TenantTimelineId, state: SafeKeeperState) -> Arc<TimelineDisk> {
|
|
||||||
self.timelines
|
|
||||||
.lock()
|
|
||||||
.entry(ttid.clone())
|
|
||||||
.and_modify(|e| {
|
|
||||||
let mut mu = e.state.lock();
|
|
||||||
*mu = state.clone();
|
|
||||||
})
|
|
||||||
.or_insert_with(|| {
|
|
||||||
Arc::new(TimelineDisk {
|
|
||||||
state: Mutex::new(state),
|
|
||||||
wal: Mutex::new(BlockStorage::new()),
|
|
||||||
})
|
|
||||||
})
|
|
||||||
.clone()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct TimelineDisk {
|
|
||||||
pub state: Mutex<SafeKeeperState>,
|
|
||||||
pub wal: Mutex<BlockStorage>,
|
|
||||||
}
|
|
||||||
|
|
||||||
const BLOCK_SIZE: usize = 8192;
|
|
||||||
|
|
||||||
pub struct BlockStorage {
|
|
||||||
blocks: HashMap<u64, [u8; BLOCK_SIZE]>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BlockStorage {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
BlockStorage {
|
|
||||||
blocks: HashMap::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn read(&self, pos: u64, buf: &mut [u8]) {
|
|
||||||
let mut buf_offset = 0;
|
|
||||||
let mut storage_pos = pos;
|
|
||||||
while buf_offset < buf.len() {
|
|
||||||
let block_id = storage_pos / BLOCK_SIZE as u64;
|
|
||||||
let block = self.blocks.get(&block_id).unwrap_or(&[0; BLOCK_SIZE]);
|
|
||||||
let block_offset = storage_pos % BLOCK_SIZE as u64;
|
|
||||||
let block_len = BLOCK_SIZE as u64 - block_offset;
|
|
||||||
let buf_len = buf.len() - buf_offset;
|
|
||||||
let copy_len = std::cmp::min(block_len as usize, buf_len);
|
|
||||||
buf[buf_offset..buf_offset + copy_len]
|
|
||||||
.copy_from_slice(&block[block_offset as usize..block_offset as usize + copy_len]);
|
|
||||||
buf_offset += copy_len;
|
|
||||||
storage_pos += copy_len as u64;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn write(&mut self, pos: u64, buf: &[u8]) {
|
|
||||||
let mut buf_offset = 0;
|
|
||||||
let mut storage_pos = pos;
|
|
||||||
while buf_offset < buf.len() {
|
|
||||||
let block_id = storage_pos / BLOCK_SIZE as u64;
|
|
||||||
let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]);
|
|
||||||
let block_offset = storage_pos % BLOCK_SIZE as u64;
|
|
||||||
let block_len = BLOCK_SIZE as u64 - block_offset;
|
|
||||||
let buf_len = buf.len() - buf_offset;
|
|
||||||
let copy_len = std::cmp::min(block_len as usize, buf_len);
|
|
||||||
block[block_offset as usize..block_offset as usize + copy_len]
|
|
||||||
.copy_from_slice(&buf[buf_offset..buf_offset + copy_len]);
|
|
||||||
buf_offset += copy_len;
|
|
||||||
storage_pos += copy_len as u64
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
use std::{sync::Arc, fmt};
|
|
||||||
|
|
||||||
use safekeeper::simlib::{world::World, sync::Mutex};
|
|
||||||
use tracing_subscriber::fmt::{time::FormatTime, format::Writer};
|
|
||||||
use utils::logging;
|
|
||||||
|
|
||||||
use crate::bindings;
|
|
||||||
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct SimClock {
|
|
||||||
world_ptr: Arc<Mutex<Option<Arc<World>>>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for SimClock {
|
|
||||||
fn default() -> Self {
|
|
||||||
SimClock {
|
|
||||||
world_ptr: Arc::new(Mutex::new(None)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SimClock {
|
|
||||||
pub fn set_world(&self, world: Arc<World>) {
|
|
||||||
*self.world_ptr.lock() = Some(world);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FormatTime for SimClock {
|
|
||||||
fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result {
|
|
||||||
let world = self.world_ptr.lock().clone();
|
|
||||||
|
|
||||||
if let Some(world) = world {
|
|
||||||
let now = world.now();
|
|
||||||
write!(w, "[{}]", now)
|
|
||||||
} else {
|
|
||||||
write!(w, "[?]")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn init_logger() -> SimClock {
|
|
||||||
let debug_enabled = unsafe { bindings::debug_enabled };
|
|
||||||
|
|
||||||
let clock = SimClock::default();
|
|
||||||
let base_logger = tracing_subscriber::fmt()
|
|
||||||
.with_target(false)
|
|
||||||
.with_timer(clock.clone())
|
|
||||||
.with_ansi(true)
|
|
||||||
.with_max_level(match debug_enabled {
|
|
||||||
true => tracing::Level::DEBUG,
|
|
||||||
false => tracing::Level::INFO,
|
|
||||||
})
|
|
||||||
.with_writer(std::io::stdout);
|
|
||||||
base_logger.init();
|
|
||||||
|
|
||||||
// logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
|
||||||
std::panic::set_hook(Box::new(|_| {}));
|
|
||||||
|
|
||||||
clock
|
|
||||||
}
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
#[cfg(test)]
|
|
||||||
pub mod simple_client;
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
pub mod wp_sk;
|
|
||||||
|
|
||||||
pub mod disk;
|
|
||||||
pub mod safekeeper;
|
|
||||||
pub mod storage;
|
|
||||||
pub mod log;
|
|
||||||
pub mod util;
|
|
||||||
@@ -1,372 +0,0 @@
|
|||||||
//! Safekeeper communication endpoint to WAL proposer (compute node).
|
|
||||||
//! Gets messages from the network, passes them down to consensus module and
|
|
||||||
//! sends replies back.
|
|
||||||
|
|
||||||
use std::{collections::HashMap, path::PathBuf, sync::Arc, time::Duration};
|
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, Result};
|
|
||||||
use bytes::{Bytes, BytesMut};
|
|
||||||
use hyper::Uri;
|
|
||||||
use log::info;
|
|
||||||
use safekeeper::{
|
|
||||||
safekeeper::{
|
|
||||||
ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo, UNKNOWN_SERVER_VERSION,
|
|
||||||
},
|
|
||||||
simlib::{network::TCP, node_os::NodeOs, proto::AnyMessage, world::NodeEvent},
|
|
||||||
timeline::TimelineError,
|
|
||||||
SafeKeeperConf, wal_storage::Storage,
|
|
||||||
};
|
|
||||||
use tracing::{debug, info_span};
|
|
||||||
use utils::{
|
|
||||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
|
||||||
lsn::Lsn,
|
|
||||||
};
|
|
||||||
|
|
||||||
use crate::simtest::storage::DiskStateStorage;
|
|
||||||
|
|
||||||
use super::{
|
|
||||||
disk::{Disk, TimelineDisk},
|
|
||||||
storage::DiskWALStorage,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ConnState {
|
|
||||||
tcp: TCP,
|
|
||||||
|
|
||||||
greeting: bool,
|
|
||||||
ttid: TenantTimelineId,
|
|
||||||
flush_pending: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct SharedState {
|
|
||||||
sk: SafeKeeper<DiskStateStorage, DiskWALStorage>,
|
|
||||||
disk: Arc<TimelineDisk>,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct GlobalMap {
|
|
||||||
timelines: HashMap<TenantTimelineId, SharedState>,
|
|
||||||
conf: SafeKeeperConf,
|
|
||||||
disk: Arc<Disk>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl GlobalMap {
|
|
||||||
fn new(disk: Arc<Disk>, conf: SafeKeeperConf) -> Result<Self> {
|
|
||||||
let mut timelines = HashMap::new();
|
|
||||||
|
|
||||||
for (&ttid, disk) in disk.timelines.lock().iter() {
|
|
||||||
debug!("loading timeline {}", ttid);
|
|
||||||
let state = disk.state.lock().clone();
|
|
||||||
|
|
||||||
if state.server.wal_seg_size == 0 {
|
|
||||||
bail!(TimelineError::UninitializedWalSegSize(ttid));
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.server.pg_version == UNKNOWN_SERVER_VERSION {
|
|
||||||
bail!(TimelineError::UninitialinzedPgVersion(ttid));
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.commit_lsn < state.local_start_lsn {
|
|
||||||
bail!(
|
|
||||||
"commit_lsn {} is higher than local_start_lsn {}",
|
|
||||||
state.commit_lsn,
|
|
||||||
state.local_start_lsn
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let control_store = DiskStateStorage::new(disk.clone());
|
|
||||||
let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?;
|
|
||||||
|
|
||||||
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
|
|
||||||
timelines.insert(
|
|
||||||
ttid.clone(),
|
|
||||||
SharedState {
|
|
||||||
sk,
|
|
||||||
disk: disk.clone(),
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
timelines,
|
|
||||||
conf,
|
|
||||||
disk,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> {
|
|
||||||
if self.timelines.contains_key(&ttid) {
|
|
||||||
bail!("timeline {} already exists", ttid);
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("creating new timeline {}", ttid);
|
|
||||||
|
|
||||||
let commit_lsn = Lsn::INVALID;
|
|
||||||
let local_start_lsn = Lsn::INVALID;
|
|
||||||
|
|
||||||
// TODO: load state from in-memory storage
|
|
||||||
let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
|
|
||||||
|
|
||||||
if state.server.wal_seg_size == 0 {
|
|
||||||
bail!(TimelineError::UninitializedWalSegSize(ttid));
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.server.pg_version == UNKNOWN_SERVER_VERSION {
|
|
||||||
bail!(TimelineError::UninitialinzedPgVersion(ttid));
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.commit_lsn < state.local_start_lsn {
|
|
||||||
bail!(
|
|
||||||
"commit_lsn {} is higher than local_start_lsn {}",
|
|
||||||
state.commit_lsn,
|
|
||||||
state.local_start_lsn
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let disk_timeline = self.disk.put_state(&ttid, state);
|
|
||||||
let control_store = DiskStateStorage::new(disk_timeline.clone());
|
|
||||||
let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?;
|
|
||||||
|
|
||||||
let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?;
|
|
||||||
|
|
||||||
self.timelines.insert(
|
|
||||||
ttid.clone(),
|
|
||||||
SharedState {
|
|
||||||
sk,
|
|
||||||
disk: disk_timeline,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState {
|
|
||||||
self.timelines.get_mut(ttid).expect("timeline must exist")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn has_tli(&self, ttid: &TenantTimelineId) -> bool {
|
|
||||||
self.timelines.contains_key(ttid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn run_server(os: NodeOs, disk: Arc<Disk>) -> Result<()> {
|
|
||||||
let _enter = info_span!("safekeeper", id = os.id()).entered();
|
|
||||||
debug!("started server");
|
|
||||||
os.log_event("started;safekeeper".to_owned());
|
|
||||||
let conf = SafeKeeperConf {
|
|
||||||
workdir: PathBuf::from("."),
|
|
||||||
my_id: NodeId(os.id() as u64),
|
|
||||||
listen_pg_addr: String::new(),
|
|
||||||
listen_http_addr: String::new(),
|
|
||||||
no_sync: false,
|
|
||||||
broker_endpoint: "/".parse::<Uri>().unwrap(),
|
|
||||||
broker_keepalive_interval: Duration::from_secs(0),
|
|
||||||
heartbeat_timeout: Duration::from_secs(0),
|
|
||||||
remote_storage: None,
|
|
||||||
max_offloader_lag_bytes: 0,
|
|
||||||
backup_runtime_threads: None,
|
|
||||||
wal_backup_enabled: false,
|
|
||||||
auth: None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut global = GlobalMap::new(disk, conf.clone())?;
|
|
||||||
let mut conns: HashMap<i64, ConnState> = HashMap::new();
|
|
||||||
|
|
||||||
for (&ttid, shared_state) in global.timelines.iter_mut() {
|
|
||||||
let flush_lsn = shared_state.sk.wal_store.flush_lsn();
|
|
||||||
let commit_lsn = shared_state.sk.state.commit_lsn;
|
|
||||||
os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0));
|
|
||||||
}
|
|
||||||
|
|
||||||
let epoll = os.epoll();
|
|
||||||
loop {
|
|
||||||
// waiting for the next message
|
|
||||||
let mut next_event = Some(epoll.recv());
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let event = match next_event {
|
|
||||||
Some(event) => event,
|
|
||||||
None => break,
|
|
||||||
};
|
|
||||||
|
|
||||||
match event {
|
|
||||||
NodeEvent::Accept(tcp) => {
|
|
||||||
conns.insert(
|
|
||||||
tcp.id(),
|
|
||||||
ConnState {
|
|
||||||
tcp,
|
|
||||||
greeting: false,
|
|
||||||
ttid: TenantTimelineId::empty(),
|
|
||||||
flush_pending: false,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
NodeEvent::Message((msg, tcp)) => {
|
|
||||||
let conn = conns.get_mut(&tcp.id());
|
|
||||||
if let Some(conn) = conn {
|
|
||||||
let res = conn.process_any(msg, &mut global);
|
|
||||||
if res.is_err() {
|
|
||||||
debug!("conn {:?} error: {:#}", tcp, res.unwrap_err());
|
|
||||||
conns.remove(&tcp.id());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
debug!("conn {:?} was closed, dropping msg {:?}", tcp, msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NodeEvent::Internal(_) => {}
|
|
||||||
NodeEvent::Closed(_) => {}
|
|
||||||
NodeEvent::WakeTimeout(_) => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: make simulator support multiple events per tick
|
|
||||||
next_event = epoll.try_recv();
|
|
||||||
}
|
|
||||||
|
|
||||||
conns.retain(|_, conn| {
|
|
||||||
let res = conn.flush(&mut global);
|
|
||||||
if res.is_err() {
|
|
||||||
debug!("conn {:?} error: {:?}", conn.tcp, res);
|
|
||||||
}
|
|
||||||
res.is_ok()
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ConnState {
|
|
||||||
fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> {
|
|
||||||
if let AnyMessage::Bytes(copy_data) = any {
|
|
||||||
let repl_prefix = b"START_REPLICATION ";
|
|
||||||
if !self.greeting && copy_data.starts_with(repl_prefix) {
|
|
||||||
self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?;
|
|
||||||
bail!("finished processing START_REPLICATION")
|
|
||||||
}
|
|
||||||
|
|
||||||
let msg = ProposerAcceptorMessage::parse(copy_data)?;
|
|
||||||
debug!("got msg: {:?}", msg);
|
|
||||||
return self.process(msg, global);
|
|
||||||
} else {
|
|
||||||
bail!("unexpected message, expected AnyMessage::Bytes");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn process_start_replication(
|
|
||||||
&mut self,
|
|
||||||
copy_data: Bytes,
|
|
||||||
global: &mut GlobalMap,
|
|
||||||
) -> Result<()> {
|
|
||||||
// format is "<tenant_id> <timeline_id> <start_lsn> <end_lsn>"
|
|
||||||
let str = String::from_utf8(copy_data.to_vec())?;
|
|
||||||
|
|
||||||
let mut parts = str.split(' ');
|
|
||||||
let tenant_id = parts.next().unwrap().parse::<TenantId>()?;
|
|
||||||
let timeline_id = parts.next().unwrap().parse::<TimelineId>()?;
|
|
||||||
let start_lsn = parts.next().unwrap().parse::<u64>()?;
|
|
||||||
let end_lsn = parts.next().unwrap().parse::<u64>()?;
|
|
||||||
|
|
||||||
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
|
||||||
let shared_state = global.get(&ttid);
|
|
||||||
|
|
||||||
// read bytes from start_lsn to end_lsn
|
|
||||||
let mut buf = vec![0; (end_lsn - start_lsn) as usize];
|
|
||||||
shared_state.disk.wal.lock().read(start_lsn, &mut buf);
|
|
||||||
|
|
||||||
// send bytes to the client
|
|
||||||
self.tcp.send(AnyMessage::Bytes(Bytes::from(buf)));
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn init_timeline(
|
|
||||||
&mut self,
|
|
||||||
ttid: TenantTimelineId,
|
|
||||||
server_info: ServerInfo,
|
|
||||||
global: &mut GlobalMap,
|
|
||||||
) -> Result<()> {
|
|
||||||
self.ttid = ttid;
|
|
||||||
if global.has_tli(&ttid) {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
global.create(ttid, server_info)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> {
|
|
||||||
if !self.greeting {
|
|
||||||
self.greeting = true;
|
|
||||||
|
|
||||||
match msg {
|
|
||||||
ProposerAcceptorMessage::Greeting(ref greeting) => {
|
|
||||||
debug!(
|
|
||||||
"start handshake with walproposer {:?}",
|
|
||||||
self.tcp,
|
|
||||||
);
|
|
||||||
let server_info = ServerInfo {
|
|
||||||
pg_version: greeting.pg_version,
|
|
||||||
system_id: greeting.system_id,
|
|
||||||
wal_seg_size: greeting.wal_seg_size,
|
|
||||||
};
|
|
||||||
let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id);
|
|
||||||
self.init_timeline(ttid, server_info, global)?
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
bail!("unexpected message {msg:?} instead of greeting");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let tli = global.get(&self.ttid);
|
|
||||||
|
|
||||||
match msg {
|
|
||||||
ProposerAcceptorMessage::AppendRequest(append_request) => {
|
|
||||||
self.flush_pending = true;
|
|
||||||
self.process_sk_msg(
|
|
||||||
tli,
|
|
||||||
&ProposerAcceptorMessage::NoFlushAppendRequest(append_request),
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
other => {
|
|
||||||
self.process_sk_msg(tli, &other)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Process FlushWAL if needed.
|
|
||||||
// TODO: add extra flushes, to verify that extra flushes don't break anything
|
|
||||||
fn flush(&mut self, global: &mut GlobalMap) -> Result<()> {
|
|
||||||
if !self.flush_pending {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
self.flush_pending = false;
|
|
||||||
let shared_state = global.get(&self.ttid);
|
|
||||||
self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Make safekeeper process a message and send a reply to the TCP
|
|
||||||
fn process_sk_msg(
|
|
||||||
&mut self,
|
|
||||||
shared_state: &mut SharedState,
|
|
||||||
msg: &ProposerAcceptorMessage,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut reply = shared_state.sk.process_msg(msg)?;
|
|
||||||
if let Some(reply) = &mut reply {
|
|
||||||
// // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
|
|
||||||
// if let AcceptorProposerMessage::AppendResponse(ref mut resp) = reply {
|
|
||||||
// // TODO:
|
|
||||||
// }
|
|
||||||
|
|
||||||
let mut buf = BytesMut::with_capacity(128);
|
|
||||||
reply.serialize(&mut buf)?;
|
|
||||||
|
|
||||||
self.tcp.send(AnyMessage::Bytes(buf.into()));
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for ConnState {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
debug!("dropping conn: {:?}", self.tcp);
|
|
||||||
if !std::thread::panicking() {
|
|
||||||
self.tcp.close();
|
|
||||||
}
|
|
||||||
// TODO: clean up non-fsynced WAL
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use safekeeper::{
|
|
||||||
simlib::{
|
|
||||||
network::{Delay, NetworkOptions},
|
|
||||||
world::World,
|
|
||||||
},
|
|
||||||
simtest::{start_simulation, Options},
|
|
||||||
};
|
|
||||||
|
|
||||||
use crate::{bindings::RunClientC, c_context};
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn run_rust_c_test() {
|
|
||||||
let delay = Delay {
|
|
||||||
min: 1,
|
|
||||||
max: 5,
|
|
||||||
fail_prob: 0.5,
|
|
||||||
};
|
|
||||||
|
|
||||||
let network = NetworkOptions {
|
|
||||||
keepalive_timeout: Some(50),
|
|
||||||
connect_delay: delay.clone(),
|
|
||||||
send_delay: delay.clone(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let u32_data: [u32; 5] = [1, 2, 3, 4, 5];
|
|
||||||
|
|
||||||
let world = Arc::new(World::new(1337, Arc::new(network), c_context()));
|
|
||||||
start_simulation(Options {
|
|
||||||
world,
|
|
||||||
time_limit: 1_000_000,
|
|
||||||
client_fn: Box::new(move |_, server_id| unsafe {
|
|
||||||
RunClientC(server_id);
|
|
||||||
}),
|
|
||||||
u32_data,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
@@ -1,234 +0,0 @@
|
|||||||
use std::{ops::Deref, sync::Arc};
|
|
||||||
|
|
||||||
use anyhow::Result;
|
|
||||||
use bytes::{Buf, BytesMut};
|
|
||||||
use log::{debug, info};
|
|
||||||
use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo};
|
|
||||||
use safekeeper::{control_file, safekeeper::SafeKeeperState, wal_storage};
|
|
||||||
use utils::lsn::Lsn;
|
|
||||||
|
|
||||||
use super::disk::TimelineDisk;
|
|
||||||
|
|
||||||
pub struct DiskStateStorage {
|
|
||||||
persisted_state: SafeKeeperState,
|
|
||||||
disk: Arc<TimelineDisk>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DiskStateStorage {
|
|
||||||
pub fn new(disk: Arc<TimelineDisk>) -> Self {
|
|
||||||
let guard = disk.state.lock();
|
|
||||||
let state = guard.clone();
|
|
||||||
drop(guard);
|
|
||||||
DiskStateStorage {
|
|
||||||
persisted_state: state,
|
|
||||||
disk,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl control_file::Storage for DiskStateStorage {
|
|
||||||
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
|
|
||||||
self.persisted_state = s.clone();
|
|
||||||
*self.disk.state.lock() = s.clone();
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Deref for DiskStateStorage {
|
|
||||||
type Target = SafeKeeperState;
|
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
|
||||||
&self.persisted_state
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct DummyWalStore {
|
|
||||||
lsn: Lsn,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DummyWalStore {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
DummyWalStore { lsn: Lsn::INVALID }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl wal_storage::Storage for DummyWalStore {
|
|
||||||
fn flush_lsn(&self) -> Lsn {
|
|
||||||
self.lsn
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
|
||||||
self.lsn = startpos + buf.len() as u64;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
|
|
||||||
self.lsn = end_pos;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn flush_wal(&mut self) -> Result<()> {
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
|
|
||||||
Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_metrics(&self) -> safekeeper::metrics::WalStorageMetrics {
|
|
||||||
safekeeper::metrics::WalStorageMetrics::default()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct DiskWALStorage {
|
|
||||||
/// Written to disk, but possibly still in the cache and not fully persisted.
|
|
||||||
/// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
|
|
||||||
write_lsn: Lsn,
|
|
||||||
|
|
||||||
/// The LSN of the last WAL record written to disk. Still can be not fully flushed.
|
|
||||||
write_record_lsn: Lsn,
|
|
||||||
|
|
||||||
/// The LSN of the last WAL record flushed to disk.
|
|
||||||
flush_record_lsn: Lsn,
|
|
||||||
|
|
||||||
/// Decoder is required for detecting boundaries of WAL records.
|
|
||||||
decoder: WalStreamDecoder,
|
|
||||||
|
|
||||||
unflushed_bytes: BytesMut,
|
|
||||||
|
|
||||||
disk: Arc<TimelineDisk>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DiskWALStorage {
|
|
||||||
pub fn new(disk: Arc<TimelineDisk>, state: &SafeKeeperState) -> Result<Self> {
|
|
||||||
let write_lsn = if state.commit_lsn == Lsn(0) {
|
|
||||||
Lsn(0)
|
|
||||||
} else {
|
|
||||||
Self::find_end_of_wal(disk.clone(), state.commit_lsn)?
|
|
||||||
};
|
|
||||||
|
|
||||||
let flush_lsn = write_lsn;
|
|
||||||
Ok(DiskWALStorage {
|
|
||||||
write_lsn,
|
|
||||||
write_record_lsn: flush_lsn,
|
|
||||||
flush_record_lsn: flush_lsn,
|
|
||||||
decoder: WalStreamDecoder::new(flush_lsn, 15),
|
|
||||||
unflushed_bytes: BytesMut::new(),
|
|
||||||
disk,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn find_end_of_wal(disk: Arc<TimelineDisk>, start_lsn: Lsn) -> Result<Lsn> {
|
|
||||||
let mut buf = [0; 8192];
|
|
||||||
let mut pos = start_lsn.0;
|
|
||||||
let mut decoder = WalStreamDecoder::new(start_lsn, 15);
|
|
||||||
let mut result = start_lsn;
|
|
||||||
loop {
|
|
||||||
disk.wal.lock().read(pos, &mut buf);
|
|
||||||
pos += buf.len() as u64;
|
|
||||||
decoder.feed_bytes(&buf);
|
|
||||||
|
|
||||||
loop {
|
|
||||||
match decoder.poll_decode() {
|
|
||||||
Ok(Some(record)) => result = record.0,
|
|
||||||
Err(e) => {
|
|
||||||
debug!(
|
|
||||||
"find_end_of_wal reached end at {:?}, decode error: {:?}",
|
|
||||||
result, e
|
|
||||||
);
|
|
||||||
return Ok(result);
|
|
||||||
}
|
|
||||||
Ok(None) => break, // need more data
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl wal_storage::Storage for DiskWALStorage {
|
|
||||||
fn flush_lsn(&self) -> Lsn {
|
|
||||||
self.flush_record_lsn
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
|
||||||
if self.write_lsn != startpos {
|
|
||||||
panic!("write_wal called with wrong startpos");
|
|
||||||
}
|
|
||||||
|
|
||||||
self.unflushed_bytes.extend_from_slice(buf);
|
|
||||||
self.write_lsn += buf.len() as u64;
|
|
||||||
|
|
||||||
if self.decoder.available() != startpos {
|
|
||||||
info!(
|
|
||||||
"restart decoder from {} to {}",
|
|
||||||
self.decoder.available(),
|
|
||||||
startpos,
|
|
||||||
);
|
|
||||||
self.decoder = WalStreamDecoder::new(startpos, 15);
|
|
||||||
}
|
|
||||||
self.decoder.feed_bytes(buf);
|
|
||||||
loop {
|
|
||||||
match self.decoder.poll_decode()? {
|
|
||||||
None => break, // no full record yet
|
|
||||||
Some((lsn, _rec)) => {
|
|
||||||
self.write_record_lsn = lsn;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
|
|
||||||
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
|
|
||||||
panic!(
|
|
||||||
"truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
|
|
||||||
self.write_lsn, end_pos
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
self.flush_wal()?;
|
|
||||||
|
|
||||||
// write zeroes to disk from end_pos until self.write_lsn
|
|
||||||
let buf = [0; 8192];
|
|
||||||
let mut pos = end_pos.0;
|
|
||||||
while pos < self.write_lsn.0 {
|
|
||||||
self.disk.wal.lock().write(pos, &buf);
|
|
||||||
pos += buf.len() as u64;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.write_lsn = end_pos;
|
|
||||||
self.write_record_lsn = end_pos;
|
|
||||||
self.flush_record_lsn = end_pos;
|
|
||||||
self.unflushed_bytes.clear();
|
|
||||||
self.decoder = WalStreamDecoder::new(end_pos, 15);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn flush_wal(&mut self) -> Result<()> {
|
|
||||||
if self.flush_record_lsn == self.write_record_lsn {
|
|
||||||
// no need to do extra flush
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0;
|
|
||||||
|
|
||||||
self.disk.wal.lock().write(
|
|
||||||
self.flush_record_lsn.0,
|
|
||||||
&self.unflushed_bytes[..num_bytes as usize],
|
|
||||||
);
|
|
||||||
self.unflushed_bytes.advance(num_bytes as usize);
|
|
||||||
self.flush_record_lsn = self.write_record_lsn;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
|
|
||||||
Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_metrics(&self) -> safekeeper::metrics::WalStorageMetrics {
|
|
||||||
safekeeper::metrics::WalStorageMetrics::default()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,610 +0,0 @@
|
|||||||
use std::{ffi::CString, path::Path, str::FromStr, sync::Arc, collections::HashMap};
|
|
||||||
|
|
||||||
use rand::{Rng, SeedableRng};
|
|
||||||
use safekeeper::simlib::{
|
|
||||||
network::{Delay, NetworkOptions},
|
|
||||||
proto::AnyMessage,
|
|
||||||
time::EmptyEvent,
|
|
||||||
world::World,
|
|
||||||
world::{Node, NodeEvent, SEvent, NodeId},
|
|
||||||
};
|
|
||||||
use tracing::{debug, error, info, warn};
|
|
||||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
bindings::{
|
|
||||||
neon_tenant_walproposer, neon_timeline_walproposer, sim_redo_start_lsn, syncSafekeepers,
|
|
||||||
wal_acceptor_connection_timeout, wal_acceptor_reconnect_timeout, wal_acceptors_list,
|
|
||||||
MyInsertRecord, WalProposerCleanup, WalProposerRust,
|
|
||||||
},
|
|
||||||
c_context,
|
|
||||||
simtest::{
|
|
||||||
log::{init_logger, SimClock},
|
|
||||||
safekeeper::run_server,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
use super::disk::Disk;
|
|
||||||
|
|
||||||
pub struct SkNode {
|
|
||||||
pub node: Arc<Node>,
|
|
||||||
pub id: u32,
|
|
||||||
pub disk: Arc<Disk>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SkNode {
|
|
||||||
pub fn new(node: Arc<Node>) -> Self {
|
|
||||||
let disk = Arc::new(Disk::new());
|
|
||||||
let res = Self {
|
|
||||||
id: node.id,
|
|
||||||
node,
|
|
||||||
disk,
|
|
||||||
};
|
|
||||||
res.launch();
|
|
||||||
res
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn launch(&self) {
|
|
||||||
let id = self.id;
|
|
||||||
let disk = self.disk.clone();
|
|
||||||
// start the server thread
|
|
||||||
self.node.launch(move |os| {
|
|
||||||
let res = run_server(os, disk);
|
|
||||||
debug!("server {} finished: {:?}", id, res);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn restart(&self) {
|
|
||||||
self.node.crash_stop();
|
|
||||||
self.launch();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct TestConfig {
|
|
||||||
pub network: NetworkOptions,
|
|
||||||
pub timeout: u64,
|
|
||||||
pub clock: Option<SimClock>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TestConfig {
|
|
||||||
pub fn new(clock: Option<SimClock>) -> Self {
|
|
||||||
Self {
|
|
||||||
network: NetworkOptions {
|
|
||||||
keepalive_timeout: Some(2000),
|
|
||||||
connect_delay: Delay {
|
|
||||||
min: 1,
|
|
||||||
max: 5,
|
|
||||||
fail_prob: 0.0,
|
|
||||||
},
|
|
||||||
send_delay: Delay {
|
|
||||||
min: 1,
|
|
||||||
max: 5,
|
|
||||||
fail_prob: 0.0,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
timeout: 1_000 * 10,
|
|
||||||
clock,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn start(&self, seed: u64) -> Test {
|
|
||||||
let world = Arc::new(World::new(
|
|
||||||
seed,
|
|
||||||
Arc::new(self.network.clone()),
|
|
||||||
c_context(),
|
|
||||||
));
|
|
||||||
world.register_world();
|
|
||||||
|
|
||||||
if let Some(clock) = &self.clock {
|
|
||||||
clock.set_world(world.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
let servers = [
|
|
||||||
SkNode::new(world.new_node()),
|
|
||||||
SkNode::new(world.new_node()),
|
|
||||||
SkNode::new(world.new_node()),
|
|
||||||
];
|
|
||||||
|
|
||||||
let server_ids = [servers[0].id, servers[1].id, servers[2].id];
|
|
||||||
|
|
||||||
let safekeepers_guc = server_ids.map(|id| format!("node:{}", id)).join(",");
|
|
||||||
let ttid = TenantTimelineId::generate();
|
|
||||||
|
|
||||||
// wait init for all servers
|
|
||||||
world.await_all();
|
|
||||||
|
|
||||||
// clean up pgdata directory
|
|
||||||
self.init_pgdata();
|
|
||||||
|
|
||||||
Test {
|
|
||||||
world,
|
|
||||||
servers,
|
|
||||||
safekeepers_guc,
|
|
||||||
ttid,
|
|
||||||
timeout: self.timeout,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn init_pgdata(&self) {
|
|
||||||
let pgdata = Path::new("/home/admin/simulator/libs/walproposer/pgdata");
|
|
||||||
if pgdata.exists() {
|
|
||||||
std::fs::remove_dir_all(pgdata).unwrap();
|
|
||||||
}
|
|
||||||
std::fs::create_dir(pgdata).unwrap();
|
|
||||||
|
|
||||||
// create empty pg_wal and pg_notify subdirs
|
|
||||||
std::fs::create_dir(pgdata.join("pg_wal")).unwrap();
|
|
||||||
std::fs::create_dir(pgdata.join("pg_notify")).unwrap();
|
|
||||||
|
|
||||||
// write postgresql.conf
|
|
||||||
let mut conf = std::fs::File::create(pgdata.join("postgresql.conf")).unwrap();
|
|
||||||
let content = "
|
|
||||||
wal_log_hints=off
|
|
||||||
hot_standby=on
|
|
||||||
fsync=off
|
|
||||||
wal_level=replica
|
|
||||||
restart_after_crash=off
|
|
||||||
shared_preload_libraries=neon
|
|
||||||
neon.pageserver_connstring=''
|
|
||||||
neon.tenant_id=cc6e67313d57283bad411600fbf5c142
|
|
||||||
neon.timeline_id=de6fa815c1e45aa61491c3d34c4eb33e
|
|
||||||
synchronous_standby_names=walproposer
|
|
||||||
neon.safekeepers='node:1,node:2,node:3'
|
|
||||||
max_connections=100
|
|
||||||
";
|
|
||||||
|
|
||||||
std::io::Write::write_all(&mut conf, content.as_bytes()).unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Test {
|
|
||||||
pub world: Arc<World>,
|
|
||||||
pub servers: [SkNode; 3],
|
|
||||||
pub safekeepers_guc: String,
|
|
||||||
pub ttid: TenantTimelineId,
|
|
||||||
pub timeout: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Test {
|
|
||||||
fn launch_sync(&self) -> Arc<Node> {
|
|
||||||
let client_node = self.world.new_node();
|
|
||||||
debug!("sync-safekeepers started at node {}", client_node.id);
|
|
||||||
|
|
||||||
// start the client thread
|
|
||||||
let guc = self.safekeepers_guc.clone();
|
|
||||||
let ttid = self.ttid.clone();
|
|
||||||
client_node.launch(move |_| {
|
|
||||||
let list = CString::new(guc).unwrap();
|
|
||||||
|
|
||||||
unsafe {
|
|
||||||
WalProposerCleanup();
|
|
||||||
|
|
||||||
syncSafekeepers = true;
|
|
||||||
wal_acceptors_list = list.into_raw();
|
|
||||||
wal_acceptor_reconnect_timeout = 1000;
|
|
||||||
wal_acceptor_connection_timeout = 5000;
|
|
||||||
neon_tenant_walproposer =
|
|
||||||
CString::new(ttid.tenant_id.to_string()).unwrap().into_raw();
|
|
||||||
neon_timeline_walproposer = CString::new(ttid.timeline_id.to_string())
|
|
||||||
.unwrap()
|
|
||||||
.into_raw();
|
|
||||||
WalProposerRust();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
self.world.await_all();
|
|
||||||
|
|
||||||
client_node
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn sync_safekeepers(&self) -> anyhow::Result<Lsn> {
|
|
||||||
let client_node = self.launch_sync();
|
|
||||||
|
|
||||||
// poll until exit or timeout
|
|
||||||
let time_limit = self.timeout;
|
|
||||||
while self.world.step() && self.world.now() < time_limit && !client_node.is_finished() {}
|
|
||||||
|
|
||||||
if !client_node.is_finished() {
|
|
||||||
anyhow::bail!("timeout or idle stuck");
|
|
||||||
}
|
|
||||||
|
|
||||||
let res = client_node.result.lock().clone();
|
|
||||||
if res.0 != 0 {
|
|
||||||
anyhow::bail!("non-zero exitcode: {:?}", res);
|
|
||||||
}
|
|
||||||
let lsn = Lsn::from_str(&res.1)?;
|
|
||||||
Ok(lsn)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer {
|
|
||||||
let client_node = self.world.new_node();
|
|
||||||
|
|
||||||
let lsn = if lsn.0 == 0 {
|
|
||||||
// usual LSN after basebackup
|
|
||||||
Lsn(21623024)
|
|
||||||
} else {
|
|
||||||
lsn
|
|
||||||
};
|
|
||||||
|
|
||||||
// start the client thread
|
|
||||||
let guc = self.safekeepers_guc.clone();
|
|
||||||
let ttid = self.ttid.clone();
|
|
||||||
client_node.launch(move |_| {
|
|
||||||
let list = CString::new(guc).unwrap();
|
|
||||||
|
|
||||||
unsafe {
|
|
||||||
WalProposerCleanup();
|
|
||||||
|
|
||||||
sim_redo_start_lsn = lsn.0;
|
|
||||||
syncSafekeepers = false;
|
|
||||||
wal_acceptors_list = list.into_raw();
|
|
||||||
wal_acceptor_reconnect_timeout = 1000;
|
|
||||||
wal_acceptor_connection_timeout = 5000;
|
|
||||||
neon_tenant_walproposer =
|
|
||||||
CString::new(ttid.tenant_id.to_string()).unwrap().into_raw();
|
|
||||||
neon_timeline_walproposer = CString::new(ttid.timeline_id.to_string())
|
|
||||||
.unwrap()
|
|
||||||
.into_raw();
|
|
||||||
WalProposerRust();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
self.world.await_all();
|
|
||||||
|
|
||||||
WalProposer {
|
|
||||||
node: client_node,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn poll_for_duration(&self, duration: u64) {
|
|
||||||
let time_limit = std::cmp::min(self.world.now() + duration, self.timeout);
|
|
||||||
while self.world.step() && self.world.now() < time_limit {}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> {
|
|
||||||
{
|
|
||||||
let empty_event = Box::new(EmptyEvent);
|
|
||||||
|
|
||||||
let now = self.world.now();
|
|
||||||
for (time, _) in schedule {
|
|
||||||
if *time < now {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
self.world.schedule(*time - now, empty_event.clone())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut wait_node = self.launch_sync();
|
|
||||||
// fake walproposer
|
|
||||||
let mut wp = WalProposer {
|
|
||||||
node: wait_node.clone(),
|
|
||||||
};
|
|
||||||
let mut sync_in_progress = true;
|
|
||||||
|
|
||||||
let mut skipped_tx = 0;
|
|
||||||
let mut started_tx = 0;
|
|
||||||
|
|
||||||
let mut schedule_ptr = 0;
|
|
||||||
|
|
||||||
loop {
|
|
||||||
if sync_in_progress && wait_node.is_finished() {
|
|
||||||
let res = wait_node.result.lock().clone();
|
|
||||||
if res.0 != 0 {
|
|
||||||
warn!("sync non-zero exitcode: {:?}", res);
|
|
||||||
debug!("restarting walproposer");
|
|
||||||
wait_node = self.launch_sync();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let lsn = Lsn::from_str(&res.1)?;
|
|
||||||
debug!("sync-safekeepers finished at LSN {}", lsn);
|
|
||||||
wp = self.launch_walproposer(lsn);
|
|
||||||
wait_node = wp.node.clone();
|
|
||||||
debug!("walproposer started at node {}", wait_node.id);
|
|
||||||
sync_in_progress = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
let now = self.world.now();
|
|
||||||
while schedule_ptr < schedule.len() && schedule[schedule_ptr].0 <= now {
|
|
||||||
if now != schedule[schedule_ptr].0 {
|
|
||||||
warn!("skipped event {:?} at {}", schedule[schedule_ptr], now);
|
|
||||||
}
|
|
||||||
|
|
||||||
let action = &schedule[schedule_ptr].1;
|
|
||||||
match action {
|
|
||||||
TestAction::WriteTx(size) => {
|
|
||||||
if !sync_in_progress && !wait_node.is_finished() {
|
|
||||||
started_tx += *size;
|
|
||||||
wp.write_tx(*size);
|
|
||||||
debug!("written {} transactions", size);
|
|
||||||
} else {
|
|
||||||
skipped_tx += size;
|
|
||||||
debug!("skipped {} transactions", size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TestAction::RestartSafekeeper(id) => {
|
|
||||||
debug!("restarting safekeeper {}", id);
|
|
||||||
self.servers[*id as usize].restart();
|
|
||||||
}
|
|
||||||
TestAction::RestartWalProposer => {
|
|
||||||
debug!("restarting walproposer");
|
|
||||||
wait_node.crash_stop();
|
|
||||||
sync_in_progress = true;
|
|
||||||
wait_node = self.launch_sync();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
schedule_ptr += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if schedule_ptr == schedule.len() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let next_event_time = schedule[schedule_ptr].0;
|
|
||||||
|
|
||||||
// poll until the next event
|
|
||||||
if wait_node.is_finished() {
|
|
||||||
while self.world.step() && self.world.now() < next_event_time {}
|
|
||||||
} else {
|
|
||||||
while self.world.step()
|
|
||||||
&& self.world.now() < next_event_time
|
|
||||||
&& !wait_node.is_finished()
|
|
||||||
{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("finished schedule");
|
|
||||||
debug!("skipped_tx: {}", skipped_tx);
|
|
||||||
debug!("started_tx: {}", started_tx);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct WalProposer {
|
|
||||||
pub node: Arc<Node>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl WalProposer {
|
|
||||||
pub fn write_tx(&mut self, cnt: usize) {
|
|
||||||
self.node
|
|
||||||
.network_chan()
|
|
||||||
.send(NodeEvent::Internal(AnyMessage::Just32(cnt as u32)));
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn stop(&self) {
|
|
||||||
self.node.crash_stop();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub enum TestAction {
|
|
||||||
WriteTx(usize),
|
|
||||||
RestartSafekeeper(usize),
|
|
||||||
RestartWalProposer,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub type Schedule = Vec<(u64, TestAction)>;
|
|
||||||
|
|
||||||
pub fn generate_schedule(seed: u64) -> Schedule {
|
|
||||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
|
||||||
let mut schedule = Vec::new();
|
|
||||||
let mut time = 0;
|
|
||||||
|
|
||||||
let cnt = rng.gen_range(1..100);
|
|
||||||
|
|
||||||
for _ in 0..cnt {
|
|
||||||
time += rng.gen_range(0..500);
|
|
||||||
let action = match rng.gen_range(0..3) {
|
|
||||||
0 => TestAction::WriteTx(rng.gen_range(1..10)),
|
|
||||||
1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)),
|
|
||||||
2 => TestAction::RestartWalProposer,
|
|
||||||
_ => unreachable!(),
|
|
||||||
};
|
|
||||||
schedule.push((time, action));
|
|
||||||
}
|
|
||||||
|
|
||||||
schedule
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn generate_network_opts(seed: u64) -> NetworkOptions {
|
|
||||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
|
||||||
|
|
||||||
let timeout = rng.gen_range(100..2000);
|
|
||||||
let max_delay = rng.gen_range(1..2*timeout);
|
|
||||||
let min_delay = rng.gen_range(1..=max_delay);
|
|
||||||
|
|
||||||
let max_fail_prob = rng.gen_range(0.0..0.9);
|
|
||||||
let connect_fail_prob = rng.gen_range(0.0..max_fail_prob);
|
|
||||||
let send_fail_prob = rng.gen_range(0.0..connect_fail_prob);
|
|
||||||
|
|
||||||
NetworkOptions {
|
|
||||||
keepalive_timeout: Some(timeout),
|
|
||||||
connect_delay: Delay {
|
|
||||||
min: min_delay,
|
|
||||||
max: max_delay,
|
|
||||||
fail_prob: connect_fail_prob,
|
|
||||||
},
|
|
||||||
send_delay: Delay {
|
|
||||||
min: min_delay,
|
|
||||||
max: max_delay,
|
|
||||||
fail_prob: send_fail_prob,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug,Clone,PartialEq,Eq)]
|
|
||||||
enum NodeKind {
|
|
||||||
Unknown,
|
|
||||||
Safekeeper,
|
|
||||||
WalProposer,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for NodeKind {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::Unknown
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Debug, Default)]
|
|
||||||
struct NodeInfo {
|
|
||||||
kind: NodeKind,
|
|
||||||
|
|
||||||
// walproposer
|
|
||||||
is_sync: bool,
|
|
||||||
term: u64,
|
|
||||||
epoch_lsn: u64,
|
|
||||||
|
|
||||||
// safekeeper
|
|
||||||
commit_lsn: u64,
|
|
||||||
flush_lsn: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl NodeInfo {
|
|
||||||
fn init_kind(&mut self, kind: NodeKind) {
|
|
||||||
if self.kind == NodeKind::Unknown {
|
|
||||||
self.kind = kind;
|
|
||||||
} else {
|
|
||||||
assert!(self.kind == kind);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn started(&mut self, data: &str) {
|
|
||||||
let mut parts = data.split(';');
|
|
||||||
assert!(parts.next().unwrap() == "started");
|
|
||||||
match parts.next().unwrap() {
|
|
||||||
"safekeeper" => {
|
|
||||||
self.init_kind(NodeKind::Safekeeper);
|
|
||||||
}
|
|
||||||
"walproposer" => {
|
|
||||||
self.init_kind(NodeKind::WalProposer);
|
|
||||||
let is_sync: u8 = parts.next().unwrap().parse().unwrap();
|
|
||||||
self.is_sync = is_sync != 0;
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug,Default)]
|
|
||||||
struct GlobalState {
|
|
||||||
nodes: Vec<NodeInfo>,
|
|
||||||
commit_lsn: u64,
|
|
||||||
write_lsn: u64,
|
|
||||||
max_write_lsn: u64,
|
|
||||||
|
|
||||||
written_wal: u64,
|
|
||||||
written_records: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl GlobalState {
|
|
||||||
fn new() -> Self {
|
|
||||||
Default::default()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get(&mut self, id: u32) -> &mut NodeInfo {
|
|
||||||
let id = id as usize;
|
|
||||||
if id >= self.nodes.len() {
|
|
||||||
self.nodes.resize(id + 1, NodeInfo::default());
|
|
||||||
}
|
|
||||||
&mut self.nodes[id]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn validate_events(events: Vec<SEvent>) {
|
|
||||||
const INITDB_LSN: u64 = 21623024;
|
|
||||||
|
|
||||||
let hook = std::panic::take_hook();
|
|
||||||
scopeguard::defer_on_success! {
|
|
||||||
std::panic::set_hook(hook);
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut state = GlobalState::new();
|
|
||||||
state.max_write_lsn = INITDB_LSN;
|
|
||||||
|
|
||||||
for event in events {
|
|
||||||
debug!("{:?}", event);
|
|
||||||
|
|
||||||
let node = state.get(event.node);
|
|
||||||
if event.data.starts_with("started;") {
|
|
||||||
node.started(&event.data);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
assert!(node.kind != NodeKind::Unknown);
|
|
||||||
|
|
||||||
// drop reference to unlock state
|
|
||||||
let mut node = node.clone();
|
|
||||||
|
|
||||||
let mut parts = event.data.split(';');
|
|
||||||
match node.kind {
|
|
||||||
NodeKind::Safekeeper => {
|
|
||||||
match parts.next().unwrap() {
|
|
||||||
"tli_loaded" => {
|
|
||||||
let flush_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
let commit_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
node.flush_lsn = flush_lsn;
|
|
||||||
node.commit_lsn = commit_lsn;
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NodeKind::WalProposer => {
|
|
||||||
match parts.next().unwrap() {
|
|
||||||
"prop_elected" => {
|
|
||||||
let prop_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
let prop_term: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
let prev_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
let prev_term: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
|
|
||||||
assert!(prop_lsn >= prev_lsn);
|
|
||||||
assert!(prop_term >= prev_term);
|
|
||||||
|
|
||||||
assert!(prop_lsn >= state.commit_lsn);
|
|
||||||
|
|
||||||
if prop_lsn > state.write_lsn {
|
|
||||||
assert!(prop_lsn <= state.max_write_lsn);
|
|
||||||
debug!("moving write_lsn up from {} to {}", state.write_lsn, prop_lsn);
|
|
||||||
state.write_lsn = prop_lsn;
|
|
||||||
}
|
|
||||||
if prop_lsn < state.write_lsn {
|
|
||||||
debug!("moving write_lsn down from {} to {}", state.write_lsn, prop_lsn);
|
|
||||||
state.write_lsn = prop_lsn;
|
|
||||||
}
|
|
||||||
|
|
||||||
node.epoch_lsn = prop_lsn;
|
|
||||||
node.term = prop_term;
|
|
||||||
}
|
|
||||||
"write_wal" => {
|
|
||||||
assert!(!node.is_sync);
|
|
||||||
let start_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
let end_lsn: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
let cnt: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
|
|
||||||
let size = end_lsn - start_lsn;
|
|
||||||
state.written_wal += size;
|
|
||||||
state.written_records += cnt;
|
|
||||||
|
|
||||||
// TODO: If we allow writing WAL before winning the election
|
|
||||||
|
|
||||||
assert!(start_lsn >= state.commit_lsn);
|
|
||||||
assert!(end_lsn >= start_lsn);
|
|
||||||
assert!(start_lsn == state.write_lsn);
|
|
||||||
state.write_lsn = end_lsn;
|
|
||||||
|
|
||||||
if end_lsn > state.max_write_lsn {
|
|
||||||
state.max_write_lsn = end_lsn;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"commit_lsn" => {
|
|
||||||
let lsn: u64 = parts.next().unwrap().parse().unwrap();
|
|
||||||
assert!(lsn >= state.commit_lsn);
|
|
||||||
state.commit_lsn = lsn;
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
|
||||||
}
|
|
||||||
|
|
||||||
// update the node in the state struct
|
|
||||||
*state.get(event.node) = node;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,265 +0,0 @@
|
|||||||
use std::{ffi::CString, path::Path, str::FromStr, sync::Arc};
|
|
||||||
|
|
||||||
use rand::Rng;
|
|
||||||
use safekeeper::simlib::{
|
|
||||||
network::{Delay, NetworkOptions},
|
|
||||||
proto::AnyMessage,
|
|
||||||
world::World,
|
|
||||||
world::{Node, NodeEvent},
|
|
||||||
};
|
|
||||||
use tracing::{info, warn};
|
|
||||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
bindings::{
|
|
||||||
neon_tenant_walproposer, neon_timeline_walproposer, sim_redo_start_lsn, syncSafekeepers,
|
|
||||||
wal_acceptor_connection_timeout, wal_acceptor_reconnect_timeout, wal_acceptors_list,
|
|
||||||
MyInsertRecord, WalProposerCleanup, WalProposerRust,
|
|
||||||
},
|
|
||||||
c_context,
|
|
||||||
simtest::{
|
|
||||||
log::{init_logger, SimClock},
|
|
||||||
safekeeper::run_server,
|
|
||||||
util::{generate_schedule, TestConfig, generate_network_opts, validate_events},
|
|
||||||
}, enable_debug,
|
|
||||||
};
|
|
||||||
|
|
||||||
use super::{
|
|
||||||
disk::Disk,
|
|
||||||
util::{Schedule, TestAction},
|
|
||||||
};
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn sync_empty_safekeepers() {
|
|
||||||
let clock = init_logger();
|
|
||||||
let mut config = TestConfig::new(Some(clock));
|
|
||||||
let test = config.start(1337);
|
|
||||||
|
|
||||||
let lsn = test.sync_safekeepers().unwrap();
|
|
||||||
assert_eq!(lsn, Lsn(0));
|
|
||||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
|
||||||
|
|
||||||
let lsn = test.sync_safekeepers().unwrap();
|
|
||||||
assert_eq!(lsn, Lsn(0));
|
|
||||||
info!("Sucessfully synced (again) empty safekeepers at 0/0");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn run_walproposer_generate_wal() {
|
|
||||||
let clock = init_logger();
|
|
||||||
let mut config = TestConfig::new(Some(clock));
|
|
||||||
// config.network.timeout = Some(250);
|
|
||||||
let test = config.start(1337);
|
|
||||||
|
|
||||||
let lsn = test.sync_safekeepers().unwrap();
|
|
||||||
assert_eq!(lsn, Lsn(0));
|
|
||||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
|
||||||
|
|
||||||
let mut wp = test.launch_walproposer(lsn);
|
|
||||||
|
|
||||||
test.poll_for_duration(30);
|
|
||||||
|
|
||||||
for i in 0..100 {
|
|
||||||
wp.write_tx(1);
|
|
||||||
test.poll_for_duration(5);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn crash_safekeeper() {
|
|
||||||
let clock = init_logger();
|
|
||||||
let mut config = TestConfig::new(Some(clock));
|
|
||||||
// config.network.timeout = Some(250);
|
|
||||||
let test = config.start(1337);
|
|
||||||
|
|
||||||
let lsn = test.sync_safekeepers().unwrap();
|
|
||||||
assert_eq!(lsn, Lsn(0));
|
|
||||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
|
||||||
|
|
||||||
let mut wp = test.launch_walproposer(lsn);
|
|
||||||
|
|
||||||
test.poll_for_duration(30);
|
|
||||||
|
|
||||||
wp.write_tx(3);
|
|
||||||
|
|
||||||
test.servers[0].restart();
|
|
||||||
|
|
||||||
test.poll_for_duration(100);
|
|
||||||
test.poll_for_duration(1000);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_simple_restart() {
|
|
||||||
let clock = init_logger();
|
|
||||||
let mut config = TestConfig::new(Some(clock));
|
|
||||||
// config.network.timeout = Some(250);
|
|
||||||
let test = config.start(1337);
|
|
||||||
|
|
||||||
let lsn = test.sync_safekeepers().unwrap();
|
|
||||||
assert_eq!(lsn, Lsn(0));
|
|
||||||
info!("Sucessfully synced empty safekeepers at 0/0");
|
|
||||||
|
|
||||||
let mut wp = test.launch_walproposer(lsn);
|
|
||||||
|
|
||||||
test.poll_for_duration(30);
|
|
||||||
|
|
||||||
wp.write_tx(3);
|
|
||||||
test.poll_for_duration(100);
|
|
||||||
|
|
||||||
wp.stop();
|
|
||||||
drop(wp);
|
|
||||||
|
|
||||||
let lsn = test.sync_safekeepers().unwrap();
|
|
||||||
info!("Sucessfully synced safekeepers at {}", lsn);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Drive a fixed, hand-written schedule of walproposer/safekeeper restarts
/// and transaction writes through the simulation and expect a clean finish.
#[test]
fn test_simple_schedule() -> anyhow::Result<()> {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    config.network.keepalive_timeout = Some(100);
    let test = config.start(1337);

    // Build the schedule step by step: (simulation time, action) pairs.
    let mut plan: Schedule = Vec::new();
    plan.push((0, TestAction::RestartWalProposer));
    plan.push((50, TestAction::WriteTx(5)));
    // Roll a restart through each safekeeper while writes continue.
    plan.push((100, TestAction::RestartSafekeeper(0)));
    plan.push((100, TestAction::WriteTx(5)));
    plan.push((110, TestAction::RestartSafekeeper(1)));
    plan.push((110, TestAction::WriteTx(5)));
    plan.push((120, TestAction::RestartSafekeeper(2)));
    plan.push((120, TestAction::WriteTx(5)));
    // Bounce the walproposer, then everything at once.
    plan.push((201, TestAction::RestartWalProposer));
    plan.push((251, TestAction::RestartSafekeeper(0)));
    plan.push((251, TestAction::RestartSafekeeper(1)));
    plan.push((251, TestAction::RestartSafekeeper(2)));
    plan.push((251, TestAction::WriteTx(5)));
    plan.push((255, TestAction::WriteTx(5)));
    plan.push((1000, TestAction::WriteTx(5)));

    test.run_schedule(&plan)?;
    info!("Test finished, stopping all threads");
    test.world.deallocate();

    Ok(())
}
|
|
||||||
|
|
||||||
/// Issue many small transactions and assert that the final observed
/// commit_lsn advanced sufficiently far past the initdb LSN.
#[test]
fn test_many_tx() -> anyhow::Result<()> {
    enable_debug();
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    let test = config.start(1337);

    // 100 steps, 10 time units apart, each writing 10 transactions.
    let schedule: Schedule = (0..100)
        .map(|step| (step * 10, TestAction::WriteTx(10)))
        .collect();

    test.run_schedule(&schedule)?;
    info!("Test finished, stopping all threads");
    test.world.stop_all();

    let events = test.world.take_events();
    info!("Events: {:?}", events);
    // Walk the event log backwards and take the most recent
    // "commit_lsn;<lsn>;..." record.
    let last_commit_lsn = events
        .iter()
        .rev()
        .find_map(|event| {
            let rest = event.data.strip_prefix("commit_lsn;")?;
            let lsn: u64 = rest.split(';').next().unwrap().parse().unwrap();
            Some(lsn)
        })
        .unwrap();

    // LSN right after initdb for this particular simulated cluster.
    let initdb_lsn = 21623024;
    let diff = last_commit_lsn - initdb_lsn;
    info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff);
    assert!(diff > 1000 * 8);
    Ok(())
}
|
|
||||||
|
|
||||||
/// Fuzz the simulation: run many randomly seeded schedules with randomly
/// generated network conditions and validate the resulting event log each
/// time. The seed is logged so a failure can be replayed in
/// `test_one_schedule`.
#[test]
fn test_random_schedules() -> anyhow::Result<()> {
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    config.network.keepalive_timeout = Some(100);

    // The loop counter itself is unused; only the fresh random seed matters.
    for _ in 0..30000 {
        let seed: u64 = rand::thread_rng().gen();
        config.network = generate_network_opts(seed);

        let test = config.start(seed);
        warn!("Running test with seed {}", seed);

        let schedule = generate_schedule(seed);
        test.run_schedule(&schedule).unwrap();
        validate_events(test.world.take_events());
        test.world.deallocate();
    }

    Ok(())
}
|
|
||||||
|
|
||||||
/// Replay a single generated schedule from a fixed seed. Intended for
/// reproducing failures found by `test_random_schedules`: paste the offending
/// seed below and rerun.
#[test]
fn test_one_schedule() -> anyhow::Result<()> {
    enable_debug();
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));
    config.network.keepalive_timeout = Some(100);

    // Seed to reproduce; take it from the log line of a failing fuzz run.
    let seed = 3649773280641776194;
    config.network = generate_network_opts(seed);
    info!("network: {:?}", config.network);
    let test = config.start(seed);
    warn!("Running test with seed {}", seed);

    let schedule = generate_schedule(seed);
    info!("schedule: {:?}", schedule);
    test.run_schedule(&schedule).unwrap();
    validate_events(test.world.take_events());
    test.world.deallocate();

    Ok(())
}
|
|
||||||
|
|
||||||
/// Check that simulation resources are actually released: after stopping the
/// world and dropping the test handle, `deallocate` is called and the Arc
/// strong count is logged before and after for manual inspection.
#[test]
fn test_res_dealloc() -> anyhow::Result<()> {
    // enable_debug();
    let clock = init_logger();
    let mut config = TestConfig::new(Some(clock));

    // print pid
    // SAFETY: getpid() takes no arguments, has no preconditions and always
    // succeeds.
    let pid = unsafe { libc::getpid() };
    info!("pid: {}", pid);

    // Fixed seed: this test is about resource cleanup, not fuzz coverage.
    let seed = 123456;
    config.network = generate_network_opts(seed);
    let test = config.start(seed);
    warn!("Running test with seed {}", seed);

    let schedule = generate_schedule(seed);
    info!("schedule: {:?}", schedule);
    test.run_schedule(&schedule).unwrap();
    test.world.stop_all();

    // Keep one handle to the world, drop everything else, then observe the
    // strong count around deallocation (logged, not asserted).
    let world = test.world.clone();
    drop(test);
    info!("world strong count: {}", Arc::strong_count(&world));
    world.deallocate();
    info!("world strong count: {}", Arc::strong_count(&world));

    Ok(())
}
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
use tracing::info;
|
|
||||||
|
|
||||||
use crate::bindings::{TestFunc, MyContextInit};
|
|
||||||
|
|
||||||
/// Smoke test for the Rust -> C bindings: initialize the C-side context and
/// call a trivial C function from a dedicated thread, then log its result.
#[test]
fn test_rust_c_calls() {
    // Run on a fresh thread so the C global state is set up on that thread.
    let res = std::thread::spawn(|| {
        // SAFETY: assumes MyContextInit() must run before TestFunc() and that
        // both are sound to call from this thread — TODO confirm against the
        // C implementation.
        let res = unsafe {
            MyContextInit();
            TestFunc(1, 2)
        };
        res
    }).join().unwrap();
    info!("res: {}", res);
}
|
|
||||||
|
|
||||||
/// Call the C bindings twice, each time from a new thread, to check that
/// repeated MyContextInit()/TestFunc() invocations do not crash.
#[test]
fn test_sim_bindings() {
    // First thread: initialize the C context and call into it.
    std::thread::spawn(|| {
        // SAFETY: assumes MyContextInit()/TestFunc() are sound to call from a
        // fresh thread — TODO confirm against the C implementation.
        unsafe {
            MyContextInit();
            TestFunc(1, 2)
        }
    }).join().unwrap();
    // Second thread: exercise the "already initialized" path.
    std::thread::spawn(|| {
        // SAFETY: same assumption as above for a second initialization.
        unsafe {
            MyContextInit();
            TestFunc(1, 2)
        }
    }).join().unwrap();
}
|
|
||||||
@@ -1,100 +0,0 @@
|
|||||||
#include "bindgen_deps.h"
|
|
||||||
#include "rust_bindings.h"
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <pthread.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include "postgres.h"
|
|
||||||
#include "utils/memutils.h"
|
|
||||||
#include "utils/guc.h"
|
|
||||||
#include "miscadmin.h"
|
|
||||||
#include "common/pg_prng.h"
|
|
||||||
|
|
||||||
// From src/backend/main/main.c
|
|
||||||
const char *progname = "fakepostgres";
|
|
||||||
|
|
||||||
/*
 * Trivial FFI round-trip used by the Rust smoke tests: prints its inputs,
 * calls back into Rust via rust_function(), emits a postgres elog to verify
 * logging is wired up, and returns the sum of its arguments.
 */
int TestFunc(int a, int b) {
    printf("TestFunc: %d + %d = %d\n", a, b, a + b);
    rust_function(0);
    elog(LOG, "postgres elog test");
    printf("After rust_function\n");
    return a + b;
}
|
|
||||||
|
|
||||||
// This is a quick experiment with rewriting existing Rust code in C.
|
|
||||||
/*
 * Simulated client: opens a TCP connection to `serverId` and keeps resending
 * replication cells until all `data_len` of them have been acknowledged,
 * reopening the connection whenever the server closes it.
 */
void RunClientC(uint32_t serverId) {
    uint32_t clientId = sim_id();

    elog(LOG, "started client");

    /* Total number of cells this client has to deliver. */
    int data_len = 5;

    int delivered = 0;
    int tcp = sim_open_tcp(serverId);
    while (delivered < data_len) {
        /* (Re)send the next undelivered cell on the current connection. */
        sim_msg_set_repl_cell(delivered+1, clientId, delivered);
        sim_tcp_send(tcp);

        /* Block until the next network event (-1 = no timeout). */
        Event event = sim_epoll_rcv(-1);
        switch (event.tag)
        {
        case Closed:
            /* Server dropped the connection; reconnect and retry the send. */
            elog(LOG, "connection closed");
            tcp = sim_open_tcp(serverId);
            break;

        case Message:
            Assert(event.any_message == Just32);
            uint32_t val;
            sim_msg_get_just_u32(&val);
            /*
             * NOTE(review): presumably the reply carries the sequence number
             * the server has accepted; only then do we advance — confirm
             * against the server-side handler.
             */
            if (val == delivered + 1) {
                delivered += 1;
            }
            break;

        default:
            /* No other event kinds are expected in this protocol. */
            Assert(false);
        }
    }
}
|
|
||||||
|
|
||||||
bool debug_enabled = false;
|
|
||||||
|
|
||||||
bool initializedMemoryContext = false;
|
|
||||||
// pthread_mutex_init(&lock, NULL)?
|
|
||||||
pthread_mutex_t lock;
|
|
||||||
|
|
||||||
void MyContextInit() {
|
|
||||||
// initializes global variables, TODO how to make them thread-local?
|
|
||||||
pthread_mutex_lock(&lock);
|
|
||||||
if (!initializedMemoryContext) {
|
|
||||||
initializedMemoryContext = true;
|
|
||||||
MemoryContextInit();
|
|
||||||
pg_prng_seed(&pg_global_prng_state, 0);
|
|
||||||
|
|
||||||
setenv("PGDATA", "/home/admin/simulator/libs/walproposer/pgdata", 1);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Set default values for command-line options.
|
|
||||||
*/
|
|
||||||
InitializeGUCOptions();
|
|
||||||
|
|
||||||
/* Acquire configuration parameters */
|
|
||||||
if (!SelectConfigFiles(NULL, progname))
|
|
||||||
exit(1);
|
|
||||||
|
|
||||||
if (debug_enabled) {
|
|
||||||
log_min_messages = LOG;
|
|
||||||
} else {
|
|
||||||
log_min_messages = FATAL;
|
|
||||||
}
|
|
||||||
Log_line_prefix = "[%p] ";
|
|
||||||
|
|
||||||
InitializeMaxBackends();
|
|
||||||
ChangeToDataDir();
|
|
||||||
CreateSharedMemoryAndSemaphores();
|
|
||||||
SetInstallXLogFileSegmentActive();
|
|
||||||
// CreateAuxProcessResourceOwner();
|
|
||||||
// StartupXLOG();
|
|
||||||
}
|
|
||||||
pthread_mutex_unlock(&lock);
|
|
||||||
}
|
|
||||||
@@ -23,7 +23,6 @@ const_format.workspace = true
|
|||||||
consumption_metrics.workspace = true
|
consumption_metrics.workspace = true
|
||||||
crc32c.workspace = true
|
crc32c.workspace = true
|
||||||
crossbeam-utils.workspace = true
|
crossbeam-utils.workspace = true
|
||||||
either.workspace = true
|
|
||||||
fail.workspace = true
|
fail.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
git-version.workspace = true
|
git-version.workspace = true
|
||||||
@@ -52,7 +51,7 @@ thiserror.workspace = true
|
|||||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||||
tokio-postgres.workspace = true
|
tokio-postgres.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
toml_edit.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
walkdir.workspace = true
|
walkdir.workspace = true
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ use pageserver_api::reltag::{RelTag, SlruKind};
|
|||||||
|
|
||||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||||
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
|
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
|
||||||
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
|
|
||||||
use postgres_ffi::TransactionId;
|
use postgres_ffi::TransactionId;
|
||||||
use postgres_ffi::XLogFileName;
|
use postgres_ffi::XLogFileName;
|
||||||
use postgres_ffi::PG_TLI;
|
use postgres_ffi::PG_TLI;
|
||||||
@@ -191,31 +190,14 @@ where
|
|||||||
{
|
{
|
||||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||||
|
|
||||||
// If full backup is requested, include all relation files.
|
// Gather and send relational files in each database if full backup is requested.
|
||||||
// Otherwise only include init forks of unlogged relations.
|
if self.full_backup {
|
||||||
let rels = self
|
for rel in self
|
||||||
.timeline
|
.timeline
|
||||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||||
.await?;
|
.await?
|
||||||
for &rel in rels.iter() {
|
{
|
||||||
// Send init fork as main fork to provide well formed empty
|
self.add_rel(rel).await?;
|
||||||
// contents of UNLOGGED relations. Postgres copies it in
|
|
||||||
// `reinit.c` during recovery.
|
|
||||||
if rel.forknum == INIT_FORKNUM {
|
|
||||||
// I doubt we need _init fork itself, but having it at least
|
|
||||||
// serves as a marker relation is unlogged.
|
|
||||||
self.add_rel(rel, rel).await?;
|
|
||||||
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.full_backup {
|
|
||||||
if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM))
|
|
||||||
{
|
|
||||||
// skip this, will include it when we reach the init fork
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
self.add_rel(rel, rel).await?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -238,16 +220,15 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add contents of relfilenode `src`, naming it as `dst`.
|
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
||||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
|
|
||||||
let nblocks = self
|
let nblocks = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_rel_size(src, self.lsn, false, self.ctx)
|
.get_rel_size(tag, self.lsn, false, self.ctx)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
// If the relation is empty, create an empty file
|
// If the relation is empty, create an empty file
|
||||||
if nblocks == 0 {
|
if nblocks == 0 {
|
||||||
let file_name = dst.to_segfile_name(0);
|
let file_name = tag.to_segfile_name(0);
|
||||||
let header = new_tar_header(&file_name, 0)?;
|
let header = new_tar_header(&file_name, 0)?;
|
||||||
self.ar.append(&header, &mut io::empty()).await?;
|
self.ar.append(&header, &mut io::empty()).await?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@@ -263,12 +244,12 @@ where
|
|||||||
for blknum in startblk..endblk {
|
for blknum in startblk..endblk {
|
||||||
let img = self
|
let img = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
|
.get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx)
|
||||||
.await?;
|
.await?;
|
||||||
segment_data.extend_from_slice(&img[..]);
|
segment_data.extend_from_slice(&img[..]);
|
||||||
}
|
}
|
||||||
|
|
||||||
let file_name = dst.to_segfile_name(seg as u32);
|
let file_name = tag.to_segfile_name(seg as u32);
|
||||||
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
||||||
self.ar.append(&header, segment_data.as_slice()).await?;
|
self.ar.append(&header, segment_data.as_slice()).await?;
|
||||||
|
|
||||||
|
|||||||
@@ -88,13 +88,6 @@ fn main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Initialize logging, which must be initialized before the custom panic hook is installed.
|
|
||||||
logging::init(conf.log_format)?;
|
|
||||||
|
|
||||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
|
||||||
// disarming this hook on pageserver, because we never tear down tracing.
|
|
||||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
|
||||||
|
|
||||||
// initialize sentry if SENTRY_DSN is provided
|
// initialize sentry if SENTRY_DSN is provided
|
||||||
let _sentry_guard = init_sentry(
|
let _sentry_guard = init_sentry(
|
||||||
Some(GIT_VERSION.into()),
|
Some(GIT_VERSION.into()),
|
||||||
@@ -217,6 +210,9 @@ fn start_pageserver(
|
|||||||
launch_ts: &'static LaunchTimestamp,
|
launch_ts: &'static LaunchTimestamp,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
|
// Initialize logging
|
||||||
|
logging::init(conf.log_format)?;
|
||||||
|
|
||||||
// Print version and launch timestamp to the log,
|
// Print version and launch timestamp to the log,
|
||||||
// and expose them as prometheus metrics.
|
// and expose them as prometheus metrics.
|
||||||
// A changed version string indicates changed software.
|
// A changed version string indicates changed software.
|
||||||
|
|||||||
@@ -731,13 +731,6 @@ impl PageServerConf {
|
|||||||
})?);
|
})?);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(eviction_policy) = item.get("eviction_policy") {
|
|
||||||
t_conf.eviction_policy = Some(
|
|
||||||
toml_edit::de::from_item(eviction_policy.clone())
|
|
||||||
.context("parse eviction_policy")?,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(t_conf)
|
Ok(t_conf)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
|||||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||||
|
|
||||||
#[serde_as]
|
#[serde_as]
|
||||||
#[derive(Serialize, Debug)]
|
#[derive(Serialize)]
|
||||||
struct Ids {
|
struct Ids {
|
||||||
#[serde_as(as = "DisplayFromStr")]
|
#[serde_as(as = "DisplayFromStr")]
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
@@ -75,7 +75,7 @@ pub async fn collect_metrics(
|
|||||||
// define client here to reuse it for all requests
|
// define client here to reuse it for all requests
|
||||||
let client = reqwest::Client::new();
|
let client = reqwest::Client::new();
|
||||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
let mut prev_iteration_time: Option<std::time::Instant> = None;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
@@ -86,11 +86,11 @@ pub async fn collect_metrics(
|
|||||||
_ = ticker.tick() => {
|
_ = ticker.tick() => {
|
||||||
|
|
||||||
// send cached metrics every cached_metric_collection_interval
|
// send cached metrics every cached_metric_collection_interval
|
||||||
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
|
let send_cached = prev_iteration_time
|
||||||
|
.map(|x| x.elapsed() >= cached_metric_collection_interval)
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
if send_cached {
|
prev_iteration_time = Some(std::time::Instant::now());
|
||||||
prev_iteration_time = std::time::Instant::now();
|
|
||||||
}
|
|
||||||
|
|
||||||
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
|
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
|
||||||
}
|
}
|
||||||
@@ -287,12 +287,6 @@ pub async fn collect_metrics_iteration(
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||||
for metric in chunk_to_send.iter() {
|
|
||||||
// Report if the metric value is suspiciously large
|
|
||||||
if metric.value > (1u64 << 40) {
|
|
||||||
error!("potentially abnormal metric value: {:?}", metric);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
|
|||||||
@@ -437,13 +437,6 @@ paths:
|
|||||||
type: boolean
|
type: boolean
|
||||||
description: |
|
description: |
|
||||||
When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
|
When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
|
||||||
- name: retention_period
|
|
||||||
in: query
|
|
||||||
required: false
|
|
||||||
schema:
|
|
||||||
type: integer
|
|
||||||
description: |
|
|
||||||
Override the default retention period (in bytes) used for size calculation.
|
|
||||||
get:
|
get:
|
||||||
description: |
|
description: |
|
||||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||||
|
|||||||
@@ -7,21 +7,19 @@ use hyper::{Body, Request, Response, Uri};
|
|||||||
use metrics::launch_timestamp::LaunchTimestamp;
|
use metrics::launch_timestamp::LaunchTimestamp;
|
||||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use tenant_size_model::{SizeResult, StorageModel};
|
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||||
|
|
||||||
use super::models::{
|
use super::models::{
|
||||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
TimelineCreateRequest, TimelineInfo,
|
||||||
};
|
};
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||||
use crate::task_mgr::TaskKind;
|
use crate::task_mgr::TaskKind;
|
||||||
use crate::tenant::config::TenantConfOpt;
|
use crate::tenant::config::TenantConfOpt;
|
||||||
use crate::tenant::mgr::TenantMapInsertError;
|
use crate::tenant::mgr::TenantMapInsertError;
|
||||||
use crate::tenant::size::ModelInputs;
|
|
||||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||||
use crate::tenant::{PageReconstructError, Timeline};
|
use crate::tenant::{PageReconstructError, Timeline};
|
||||||
use crate::{config::PageServerConf, tenant::mgr};
|
use crate::{config::PageServerConf, tenant::mgr};
|
||||||
@@ -40,7 +38,7 @@ use utils::{
|
|||||||
|
|
||||||
// Imports only used for testing APIs
|
// Imports only used for testing APIs
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
use super::models::ConfigureFailpointsRequest;
|
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||||
|
|
||||||
struct State {
|
struct State {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
@@ -481,19 +479,11 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
|||||||
/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
|
/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
|
||||||
/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
|
/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
|
||||||
/// values.
|
/// values.
|
||||||
///
|
|
||||||
/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size
|
|
||||||
/// (only if it is shorter than the real cutoff).
|
|
||||||
///
|
|
||||||
/// Note: we don't update the cached size and prometheus metric here.
|
|
||||||
/// The retention period might be different, and it's nice to have a method to just calculate it
|
|
||||||
/// without modifying anything anyway.
|
|
||||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
|
||||||
let headers = request.headers();
|
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)
|
let tenant = mgr::get_tenant(tenant_id, true)
|
||||||
@@ -502,29 +492,24 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
|||||||
|
|
||||||
// this can be long operation
|
// this can be long operation
|
||||||
let inputs = tenant
|
let inputs = tenant
|
||||||
.gather_size_inputs(retention_period, &ctx)
|
.gather_size_inputs(&ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
let mut sizes = None;
|
let size = if !inputs_only.unwrap_or(false) {
|
||||||
if !inputs_only.unwrap_or(false) {
|
Some(
|
||||||
let storage_model = inputs
|
tenant
|
||||||
.calculate_model()
|
.calc_and_update_cached_synthetic_size(&inputs)
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?,
|
||||||
let size = storage_model.calculate();
|
)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
// If request header expects html, return html
|
/// Private response type with the additional "unstable" `inputs` field.
|
||||||
if headers["Accept"] == "text/html" {
|
///
|
||||||
return synthetic_size_html_response(inputs, storage_model, size);
|
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
|
||||||
}
|
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
|
||||||
sizes = Some(size);
|
|
||||||
} else if headers["Accept"] == "text/html" {
|
|
||||||
return Err(ApiError::BadRequest(anyhow!(
|
|
||||||
"inputs_only parameter is incompatible with html output request"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The type resides in the pageserver not to expose `ModelInputs`.
|
|
||||||
#[serde_with::serde_as]
|
#[serde_with::serde_as]
|
||||||
#[derive(serde::Serialize)]
|
#[derive(serde::Serialize)]
|
||||||
struct TenantHistorySize {
|
struct TenantHistorySize {
|
||||||
@@ -534,9 +519,6 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
|||||||
///
|
///
|
||||||
/// Will be none if `?inputs_only=true` was given.
|
/// Will be none if `?inputs_only=true` was given.
|
||||||
size: Option<u64>,
|
size: Option<u64>,
|
||||||
/// Size of each segment used in the model.
|
|
||||||
/// Will be null if `?inputs_only=true` was given.
|
|
||||||
segment_sizes: Option<Vec<tenant_size_model::SegmentSizeResult>>,
|
|
||||||
inputs: crate::tenant::size::ModelInputs,
|
inputs: crate::tenant::size::ModelInputs,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -544,8 +526,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
|||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
TenantHistorySize {
|
TenantHistorySize {
|
||||||
id: tenant_id,
|
id: tenant_id,
|
||||||
size: sizes.as_ref().map(|x| x.total_size),
|
size,
|
||||||
segment_sizes: sizes.map(|x| x.segments),
|
|
||||||
inputs,
|
inputs,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -610,62 +591,6 @@ async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get tenant_size SVG graph along with the JSON data.
|
|
||||||
fn synthetic_size_html_response(
|
|
||||||
inputs: ModelInputs,
|
|
||||||
storage_model: StorageModel,
|
|
||||||
sizes: SizeResult,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
let mut timeline_ids: Vec<String> = Vec::new();
|
|
||||||
let mut timeline_map: HashMap<TimelineId, usize> = HashMap::new();
|
|
||||||
for (index, ti) in inputs.timeline_inputs.iter().enumerate() {
|
|
||||||
timeline_map.insert(ti.timeline_id, index);
|
|
||||||
timeline_ids.push(ti.timeline_id.to_string());
|
|
||||||
}
|
|
||||||
let seg_to_branch: Vec<usize> = inputs
|
|
||||||
.segments
|
|
||||||
.iter()
|
|
||||||
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let svg =
|
|
||||||
tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes)
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
let mut response = String::new();
|
|
||||||
|
|
||||||
use std::fmt::Write;
|
|
||||||
write!(response, "<html>\n<body>\n").unwrap();
|
|
||||||
write!(response, "<div>\n{svg}\n</div>").unwrap();
|
|
||||||
writeln!(response, "Project size: {}", sizes.total_size).unwrap();
|
|
||||||
writeln!(response, "<pre>").unwrap();
|
|
||||||
writeln!(
|
|
||||||
response,
|
|
||||||
"{}",
|
|
||||||
serde_json::to_string_pretty(&inputs).unwrap()
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
writeln!(
|
|
||||||
response,
|
|
||||||
"{}",
|
|
||||||
serde_json::to_string_pretty(&sizes.segments).unwrap()
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
writeln!(response, "</pre>").unwrap();
|
|
||||||
write!(response, "</body>\n</html>\n").unwrap();
|
|
||||||
|
|
||||||
html_response(StatusCode::OK, response)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
|
|
||||||
let response = Response::builder()
|
|
||||||
.status(status)
|
|
||||||
.header(hyper::header::CONTENT_TYPE, "text/html")
|
|
||||||
.body(Body::from(data.as_bytes().to_vec()))
|
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
|
||||||
Ok(response)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Helper function to standardize the error messages we produce on bad durations
|
// Helper function to standardize the error messages we produce on bad durations
|
||||||
//
|
//
|
||||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||||
@@ -872,14 +797,6 @@ async fn update_tenant_config_handler(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(eviction_policy) = request_data.eviction_policy {
|
|
||||||
tenant_conf.eviction_policy = Some(
|
|
||||||
serde_json::from_value(eviction_policy)
|
|
||||||
.context("parse field `eviction_policy`")
|
|
||||||
.map_err(ApiError::BadRequest)?,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||||
@@ -925,6 +842,7 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Run GC immediately on given timeline.
|
// Run GC immediately on given timeline.
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
@@ -971,22 +889,19 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
|||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
async {
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
|
||||||
timeline
|
|
||||||
.freeze_and_flush()
|
|
||||||
.await
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
timeline
|
|
||||||
.compact(&ctx)
|
|
||||||
.await
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
}
|
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
timeline
|
||||||
.await
|
.freeze_and_flush()
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
timeline
|
||||||
|
.compact(&ctx)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn timeline_download_remote_layers_handler_post(
|
async fn timeline_download_remote_layers_handler_post(
|
||||||
@@ -1031,17 +946,6 @@ async fn active_timeline_of_active_tenant(
|
|||||||
.map_err(ApiError::NotFound)
|
.map_err(ApiError::NotFound)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
|
||||||
// Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
|
|
||||||
// For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
|
|
||||||
// Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
|
|
||||||
let query = req.uri().query();
|
|
||||||
let _ = std::panic::catch_unwind(|| {
|
|
||||||
panic!("unconditional panic for testing panic hook integration; request query: {query:?}")
|
|
||||||
});
|
|
||||||
json_response(StatusCode::NO_CONTENT, ())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
json_response(
|
json_response(
|
||||||
StatusCode::NOT_FOUND,
|
StatusCode::NOT_FOUND,
|
||||||
@@ -1107,7 +1011,7 @@ pub fn make_router(
|
|||||||
.get("/v1/tenant", tenant_list_handler)
|
.get("/v1/tenant", tenant_list_handler)
|
||||||
.post("/v1/tenant", tenant_create_handler)
|
.post("/v1/tenant", tenant_create_handler)
|
||||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||||
.get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler)
|
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
|
||||||
.put("/v1/tenant/config", update_tenant_config_handler)
|
.put("/v1/tenant/config", update_tenant_config_handler)
|
||||||
.get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
|
.get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
|
||||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||||
@@ -1126,7 +1030,7 @@ pub fn make_router(
|
|||||||
)
|
)
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
|
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
|
||||||
timeline_gc_handler,
|
testing_api!("run timeline GC", timeline_gc_handler),
|
||||||
)
|
)
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
|
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
|
||||||
@@ -1160,6 +1064,5 @@ pub fn make_router(
|
|||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||||
evict_timeline_layer_handler,
|
evict_timeline_layer_handler,
|
||||||
)
|
)
|
||||||
.get("/v1/panic", always_panic_handler)
|
|
||||||
.any(handler_404))
|
.any(handler_404))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,11 +7,11 @@ use std::fmt;
|
|||||||
use std::ops::{AddAssign, Range};
|
use std::ops::{AddAssign, Range};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||||
/// Key used in the Repository kv-store.
|
/// Key used in the Repository kv-store.
|
||||||
///
|
///
|
||||||
/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
|
/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
|
||||||
/// for what we actually store in these fields.
|
/// for what we actually store in these fields.
|
||||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
|
||||||
pub struct Key {
|
pub struct Key {
|
||||||
pub field1: u8,
|
pub field1: u8,
|
||||||
pub field2: u32,
|
pub field2: u32,
|
||||||
|
|||||||
@@ -231,9 +231,6 @@ pub enum TaskKind {
|
|||||||
// Compaction. One per tenant.
|
// Compaction. One per tenant.
|
||||||
Compaction,
|
Compaction,
|
||||||
|
|
||||||
// Eviction. One per timeline.
|
|
||||||
Eviction,
|
|
||||||
|
|
||||||
// Initial logical size calculation
|
// Initial logical size calculation
|
||||||
InitialLogicalSizeCalculation,
|
InitialLogicalSizeCalculation,
|
||||||
|
|
||||||
|
|||||||
@@ -2418,9 +2418,6 @@ impl Tenant {
|
|||||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||||
pub async fn gather_size_inputs(
|
pub async fn gather_size_inputs(
|
||||||
&self,
|
&self,
|
||||||
// `max_retention_period` overrides the cutoff that is used to calculate the size
|
|
||||||
// (only if it is shorter than the real cutoff).
|
|
||||||
max_retention_period: Option<u64>,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<size::ModelInputs> {
|
) -> anyhow::Result<size::ModelInputs> {
|
||||||
let logical_sizes_at_once = self
|
let logical_sizes_at_once = self
|
||||||
@@ -2428,41 +2425,32 @@ impl Tenant {
|
|||||||
.concurrent_tenant_size_logical_size_queries
|
.concurrent_tenant_size_logical_size_queries
|
||||||
.inner();
|
.inner();
|
||||||
|
|
||||||
// TODO: Having a single mutex block concurrent reads is not great for performance.
|
// TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
|
||||||
//
|
// are for testing/experimenting, we tolerate this.
|
||||||
// But the only case where we need to run multiple of these at once is when we
|
|
||||||
// request a size for a tenant manually via API, while another background calculation
|
|
||||||
// is in progress (which is not a common case).
|
|
||||||
//
|
//
|
||||||
// See more for on the issue #2748 condenced out of the initial PR review.
|
// See more for on the issue #2748 condenced out of the initial PR review.
|
||||||
let mut shared_cache = self.cached_logical_sizes.lock().await;
|
let mut shared_cache = self.cached_logical_sizes.lock().await;
|
||||||
|
|
||||||
size::gather_inputs(
|
size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
|
||||||
self,
|
|
||||||
logical_sizes_at_once,
|
|
||||||
max_retention_period,
|
|
||||||
&mut shared_cache,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate synthetic tenant size and cache the result.
|
/// Calculate synthetic tenant size
|
||||||
/// This is periodically called by background worker.
|
/// This is periodically called by background worker.
|
||||||
/// result is cached in tenant struct
|
/// result is cached in tenant struct
|
||||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||||
pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
|
pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
|
||||||
let inputs = self.gather_size_inputs(None, ctx).await?;
|
let inputs = self.gather_size_inputs(ctx).await?;
|
||||||
|
|
||||||
let size = inputs.calculate()?;
|
self.calc_and_update_cached_synthetic_size(&inputs)
|
||||||
|
|
||||||
self.set_cached_synthetic_size(size);
|
|
||||||
|
|
||||||
Ok(size)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Cache given synthetic size and update the metric value
|
/// Calculate synthetic size , cache it and set metric value
|
||||||
pub fn set_cached_synthetic_size(&self, size: u64) {
|
pub fn calc_and_update_cached_synthetic_size(
|
||||||
|
&self,
|
||||||
|
inputs: &size::ModelInputs,
|
||||||
|
) -> anyhow::Result<u64> {
|
||||||
|
let size = inputs.calculate()?;
|
||||||
|
|
||||||
self.cached_synthetic_tenant_size
|
self.cached_synthetic_tenant_size
|
||||||
.store(size, Ordering::Relaxed);
|
.store(size, Ordering::Relaxed);
|
||||||
|
|
||||||
@@ -2470,6 +2458,8 @@ impl Tenant {
|
|||||||
.get_metric_with_label_values(&[&self.tenant_id.to_string()])
|
.get_metric_with_label_values(&[&self.tenant_id.to_string()])
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.set(size);
|
.set(size);
|
||||||
|
|
||||||
|
Ok(size)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_cached_synthetic_size(&self) -> u64 {
|
pub fn get_cached_synthetic_size(&self) -> u64 {
|
||||||
@@ -2767,7 +2757,6 @@ pub mod harness {
|
|||||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||||
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
||||||
eviction_policy: Some(tenant_conf.eviction_policy),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -91,7 +91,6 @@ pub struct TenantConf {
|
|||||||
/// to avoid eager reconnects.
|
/// to avoid eager reconnects.
|
||||||
pub max_lsn_wal_lag: NonZeroU64,
|
pub max_lsn_wal_lag: NonZeroU64,
|
||||||
pub trace_read_requests: bool,
|
pub trace_read_requests: bool,
|
||||||
pub eviction_policy: EvictionPolicy,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Same as TenantConf, but this struct preserves the information about
|
/// Same as TenantConf, but this struct preserves the information about
|
||||||
@@ -103,7 +102,6 @@ pub struct TenantConfOpt {
|
|||||||
pub checkpoint_distance: Option<u64>,
|
pub checkpoint_distance: Option<u64>,
|
||||||
|
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub checkpoint_timeout: Option<Duration>,
|
pub checkpoint_timeout: Option<Duration>,
|
||||||
|
|
||||||
@@ -155,34 +153,6 @@ pub struct TenantConfOpt {
|
|||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub trace_read_requests: Option<bool>,
|
pub trace_read_requests: Option<bool>,
|
||||||
|
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
|
||||||
#[serde(default)]
|
|
||||||
pub eviction_policy: Option<EvictionPolicy>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
||||||
#[serde(tag = "kind")]
|
|
||||||
pub enum EvictionPolicy {
|
|
||||||
NoEviction,
|
|
||||||
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl EvictionPolicy {
|
|
||||||
pub fn discriminant_str(&self) -> &'static str {
|
|
||||||
match self {
|
|
||||||
EvictionPolicy::NoEviction => "NoEviction",
|
|
||||||
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
||||||
pub struct EvictionPolicyLayerAccessThreshold {
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub period: Duration,
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub threshold: Duration,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TenantConfOpt {
|
impl TenantConfOpt {
|
||||||
@@ -219,7 +189,6 @@ impl TenantConfOpt {
|
|||||||
trace_read_requests: self
|
trace_read_requests: self
|
||||||
.trace_read_requests
|
.trace_read_requests
|
||||||
.unwrap_or(global_conf.trace_read_requests),
|
.unwrap_or(global_conf.trace_read_requests),
|
||||||
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -292,7 +261,6 @@ impl Default for TenantConf {
|
|||||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||||
trace_read_requests: false,
|
trace_read_requests: false,
|
||||||
eviction_policy: EvictionPolicy::NoEviction,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ use crate::repository::Key;
|
|||||||
use crate::tenant::storage_layer::InMemoryLayer;
|
use crate::tenant::storage_layer::InMemoryLayer;
|
||||||
use crate::tenant::storage_layer::Layer;
|
use crate::tenant::storage_layer::Layer;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@@ -61,6 +62,8 @@ use utils::lsn::Lsn;
|
|||||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||||
pub use historic_layer_coverage::Replacement;
|
pub use historic_layer_coverage::Replacement;
|
||||||
|
|
||||||
|
use self::historic_layer_coverage::LayerKey;
|
||||||
|
|
||||||
use super::storage_layer::range_eq;
|
use super::storage_layer::range_eq;
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -87,11 +90,18 @@ pub struct LayerMap<L: ?Sized> {
|
|||||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||||
|
|
||||||
/// Index of the historic layers optimized for search
|
/// Index of the historic layers optimized for search
|
||||||
historic: BufferedHistoricLayerCoverage<Arc<L>>,
|
historic: BufferedHistoricLayerCoverage<LayerKey>,
|
||||||
|
|
||||||
|
/// All layers accessible by key. Useful for:
|
||||||
|
/// 1. Iterating all layers
|
||||||
|
/// 2. Dereferencing a self.historic search result
|
||||||
|
/// 3. Replacing a layer with a remote/local version without
|
||||||
|
/// rebuilding the self.historic index.
|
||||||
|
mapping: HashMap<LayerKey, Arc<L>>,
|
||||||
|
|
||||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||||
l0_delta_layers: Vec<Arc<L>>,
|
l0_delta_layers: HashMap<LayerKey, Arc<L>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<L: ?Sized> Default for LayerMap<L> {
|
impl<L: ?Sized> Default for LayerMap<L> {
|
||||||
@@ -100,8 +110,9 @@ impl<L: ?Sized> Default for LayerMap<L> {
|
|||||||
open_layer: None,
|
open_layer: None,
|
||||||
next_open_layer_at: None,
|
next_open_layer_at: None,
|
||||||
frozen_layers: VecDeque::default(),
|
frozen_layers: VecDeque::default(),
|
||||||
l0_delta_layers: Vec::default(),
|
l0_delta_layers: HashMap::default(),
|
||||||
historic: BufferedHistoricLayerCoverage::default(),
|
historic: BufferedHistoricLayerCoverage::default(),
|
||||||
|
mapping: HashMap::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -139,30 +150,6 @@ where
|
|||||||
self.layer_map.remove_historic_noflush(layer)
|
self.layer_map.remove_historic_noflush(layer)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Replaces existing layer iff it is the `expected`.
|
|
||||||
///
|
|
||||||
/// If the expected layer has been removed it will not be inserted by this function.
|
|
||||||
///
|
|
||||||
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
|
|
||||||
/// be done.
|
|
||||||
///
|
|
||||||
/// TODO replacement can be done without buffering and rebuilding layer map updates.
|
|
||||||
/// One way to do that is to add a layer of indirection for returned values, so
|
|
||||||
/// that we can replace values only by updating a hashmap.
|
|
||||||
pub fn replace_historic(
|
|
||||||
&mut self,
|
|
||||||
expected: &Arc<L>,
|
|
||||||
new: Arc<L>,
|
|
||||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
|
||||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(
|
|
||||||
// this is not what happens if an L0 layer was not found a anyhow error but perhaps
|
|
||||||
// that should be changed. this is good enough to show a replacement failure.
|
|
||||||
Replacement::NotFound
|
|
||||||
));
|
|
||||||
|
|
||||||
self.layer_map.replace_historic_noflush(expected, new)
|
|
||||||
}
|
|
||||||
|
|
||||||
// We will flush on drop anyway, but this method makes it
|
// We will flush on drop anyway, but this method makes it
|
||||||
// more explicit that there is some work being done.
|
// more explicit that there is some work being done.
|
||||||
/// Apply all updates
|
/// Apply all updates
|
||||||
@@ -234,33 +221,38 @@ where
|
|||||||
match (latest_delta, latest_image) {
|
match (latest_delta, latest_image) {
|
||||||
(None, None) => None,
|
(None, None) => None,
|
||||||
(None, Some(image)) => {
|
(None, Some(image)) => {
|
||||||
|
let image = self.mapping.get(&image).unwrap();
|
||||||
let lsn_floor = image.get_lsn_range().start;
|
let lsn_floor = image.get_lsn_range().start;
|
||||||
Some(SearchResult {
|
Some(SearchResult {
|
||||||
layer: image,
|
layer: image.clone(),
|
||||||
lsn_floor,
|
lsn_floor,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
(Some(delta), None) => {
|
(Some(delta), None) => {
|
||||||
|
let delta = self.mapping.get(&delta).unwrap();
|
||||||
let lsn_floor = delta.get_lsn_range().start;
|
let lsn_floor = delta.get_lsn_range().start;
|
||||||
Some(SearchResult {
|
Some(SearchResult {
|
||||||
layer: delta,
|
layer: delta.clone(),
|
||||||
lsn_floor,
|
lsn_floor,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
(Some(delta), Some(image)) => {
|
(Some(delta), Some(image)) => {
|
||||||
|
let image = self.mapping.get(&image).unwrap();
|
||||||
|
let delta = self.mapping.get(&delta).unwrap();
|
||||||
|
|
||||||
let img_lsn = image.get_lsn_range().start;
|
let img_lsn = image.get_lsn_range().start;
|
||||||
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
|
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
|
||||||
let image_exact_match = img_lsn + 1 == end_lsn;
|
let image_exact_match = img_lsn + 1 == end_lsn;
|
||||||
if image_is_newer || image_exact_match {
|
if image_is_newer || image_exact_match {
|
||||||
Some(SearchResult {
|
Some(SearchResult {
|
||||||
layer: image,
|
layer: image.clone(),
|
||||||
lsn_floor: img_lsn,
|
lsn_floor: img_lsn,
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
let lsn_floor =
|
let lsn_floor =
|
||||||
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
|
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
|
||||||
Some(SearchResult {
|
Some(SearchResult {
|
||||||
layer: delta,
|
layer: delta.clone(),
|
||||||
lsn_floor,
|
lsn_floor,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -279,13 +271,12 @@ where
|
|||||||
/// Helper function for BatchedUpdates::insert_historic
|
/// Helper function for BatchedUpdates::insert_historic
|
||||||
///
|
///
|
||||||
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
|
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
|
||||||
self.historic.insert(
|
let key = LayerKey::from(&*layer);
|
||||||
historic_layer_coverage::LayerKey::from(&*layer),
|
self.historic.insert(key.clone(), key.clone());
|
||||||
Arc::clone(&layer),
|
self.mapping.insert(key.clone(), layer.clone());
|
||||||
);
|
|
||||||
|
|
||||||
if Self::is_l0(&layer) {
|
if Self::is_l0(&layer) {
|
||||||
self.l0_delta_layers.push(layer);
|
self.l0_delta_layers.insert(key, layer.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
NUM_ONDISK_LAYERS.inc();
|
NUM_ONDISK_LAYERS.inc();
|
||||||
@@ -297,27 +288,28 @@ where
|
|||||||
/// Helper function for BatchedUpdates::remove_historic
|
/// Helper function for BatchedUpdates::remove_historic
|
||||||
///
|
///
|
||||||
pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
|
pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
|
||||||
self.historic
|
let key = historic_layer_coverage::LayerKey::from(&*layer);
|
||||||
.remove(historic_layer_coverage::LayerKey::from(&*layer));
|
self.historic.remove(key.clone());
|
||||||
|
self.mapping.remove(&key.clone());
|
||||||
|
|
||||||
if Self::is_l0(&layer) {
|
if Self::is_l0(&layer) {
|
||||||
let len_before = self.l0_delta_layers.len();
|
self.l0_delta_layers.remove(&key);
|
||||||
self.l0_delta_layers
|
|
||||||
.retain(|other| !Self::compare_arced_layers(other, &layer));
|
|
||||||
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
|
|
||||||
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
|
|
||||||
// vtable) pairs.
|
|
||||||
assert_eq!(
|
|
||||||
self.l0_delta_layers.len(),
|
|
||||||
len_before - 1,
|
|
||||||
"failed to locate removed historic layer from l0_delta_layers"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NUM_ONDISK_LAYERS.dec();
|
NUM_ONDISK_LAYERS.dec();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(self) fn replace_historic_noflush(
|
/// Replaces existing layer iff it is the `expected`.
|
||||||
|
///
|
||||||
|
/// If the expected layer has been removed it will not be inserted by this function.
|
||||||
|
///
|
||||||
|
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
|
||||||
|
/// be done.
|
||||||
|
///
|
||||||
|
/// TODO replacement can be done without buffering and rebuilding layer map updates.
|
||||||
|
/// One way to do that is to add a layer of indirection for returned values, so
|
||||||
|
/// that we can replace values only by updating a hashmap.
|
||||||
|
pub fn replace_historic(
|
||||||
&mut self,
|
&mut self,
|
||||||
expected: &Arc<L>,
|
expected: &Arc<L>,
|
||||||
new: Arc<L>,
|
new: Arc<L>,
|
||||||
@@ -338,29 +330,23 @@ where
|
|||||||
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
|
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
|
||||||
);
|
);
|
||||||
|
|
||||||
let l0_index = if expected_l0 {
|
use std::collections::hash_map::Entry;
|
||||||
// find the index in case replace worked, we need to replace that as well
|
|
||||||
Some(
|
if expected_l0 {
|
||||||
self.l0_delta_layers
|
match self.mapping.entry(key.clone()) {
|
||||||
.iter()
|
Entry::Occupied(mut entry) => entry.insert(new.clone()),
|
||||||
.position(|slot| Self::compare_arced_layers(slot, expected))
|
Entry::Vacant(_) => anyhow::bail!("layer doesn't exist"),
|
||||||
.ok_or_else(|| anyhow::anyhow!("existing l0 delta layer was not found"))?,
|
};
|
||||||
)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let replaced = self.historic.replace(&key, new.clone(), |existing| {
|
match self.mapping.entry(key.clone()) {
|
||||||
Self::compare_arced_layers(existing, expected)
|
Entry::Occupied(mut entry) => entry.insert(new.clone()),
|
||||||
});
|
Entry::Vacant(_) => anyhow::bail!("layer doesn't exist"),
|
||||||
|
};
|
||||||
|
|
||||||
if let Replacement::Replaced { .. } = &replaced {
|
Ok(Replacement::Replaced {
|
||||||
if let Some(index) = l0_index {
|
in_buffered: false,
|
||||||
self.l0_delta_layers[index] = new;
|
})
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(replaced)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper function for BatchedUpdates::drop.
|
/// Helper function for BatchedUpdates::drop.
|
||||||
@@ -388,8 +374,8 @@ where
|
|||||||
let start = key.start.to_i128();
|
let start = key.start.to_i128();
|
||||||
let end = key.end.to_i128();
|
let end = key.end.to_i128();
|
||||||
|
|
||||||
let layer_covers = |layer: Option<Arc<L>>| match layer {
|
let layer_covers = |key: Option<&LayerKey>| match key {
|
||||||
Some(layer) => layer.get_lsn_range().start >= lsn.start,
|
Some(key) => self.mapping.get(key).unwrap().get_lsn_range().start >= lsn.start,
|
||||||
None => false,
|
None => false,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -409,7 +395,7 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
|
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
|
||||||
self.historic.iter()
|
self.mapping.values().cloned()
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -436,10 +422,13 @@ where
|
|||||||
// Initialize loop variables
|
// Initialize loop variables
|
||||||
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
|
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
|
||||||
let mut current_key = start;
|
let mut current_key = start;
|
||||||
let mut current_val = version.image_coverage.query(start);
|
let mut current_val = version.image_coverage.query(start)
|
||||||
|
.map(|key| self.mapping.get(&key).unwrap().clone());
|
||||||
|
|
||||||
// Loop through the change events and push intervals
|
// Loop through the change events and push intervals
|
||||||
for (change_key, change_val) in version.image_coverage.range(start..end) {
|
for (change_key, change_val) in version.image_coverage.range(start..end) {
|
||||||
|
let change_val = change_val.map(|key| self.mapping.get(&key).unwrap().clone());
|
||||||
|
|
||||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||||
coverage.push((kr, current_val.take()));
|
coverage.push((kr, current_val.take()));
|
||||||
current_key = change_key;
|
current_key = change_key;
|
||||||
@@ -533,6 +522,7 @@ where
|
|||||||
for (change_key, change_val) in version.delta_coverage.range(start..end) {
|
for (change_key, change_val) in version.delta_coverage.range(start..end) {
|
||||||
// If there's a relevant delta in this part, add 1 and recurse down
|
// If there's a relevant delta in this part, add 1 and recurse down
|
||||||
if let Some(val) = current_val {
|
if let Some(val) = current_val {
|
||||||
|
let val = self.mapping.get(&val).unwrap().clone();
|
||||||
if val.get_lsn_range().end > lsn.start {
|
if val.get_lsn_range().end > lsn.start {
|
||||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||||
let lr = lsn.start..val.get_lsn_range().start;
|
let lr = lsn.start..val.get_lsn_range().start;
|
||||||
@@ -555,6 +545,7 @@ where
|
|||||||
|
|
||||||
// Consider the last part
|
// Consider the last part
|
||||||
if let Some(val) = current_val {
|
if let Some(val) = current_val {
|
||||||
|
let val = self.mapping.get(&val).unwrap().clone();
|
||||||
if val.get_lsn_range().end > lsn.start {
|
if val.get_lsn_range().end > lsn.start {
|
||||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||||
let lr = lsn.start..val.get_lsn_range().start;
|
let lr = lsn.start..val.get_lsn_range().start;
|
||||||
@@ -711,7 +702,7 @@ where
|
|||||||
|
|
||||||
/// Return all L0 delta layers
|
/// Return all L0 delta layers
|
||||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
|
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
|
||||||
Ok(self.l0_delta_layers.clone())
|
Ok(self.l0_delta_layers.values().cloned().collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// debugging function to print out the contents of the layer map
|
/// debugging function to print out the contents of the layer map
|
||||||
@@ -736,32 +727,6 @@ where
|
|||||||
println!("End dump LayerMap");
|
println!("End dump LayerMap");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
|
|
||||||
///
|
|
||||||
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
|
|
||||||
#[inline(always)]
|
|
||||||
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
|
|
||||||
// "dyn Trait" objects are "fat pointers" in that they have two components:
|
|
||||||
// - pointer to the object
|
|
||||||
// - pointer to the vtable
|
|
||||||
//
|
|
||||||
// rust does not provide a guarantee that these vtables are unique, but however
|
|
||||||
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
|
|
||||||
// pointer and the vtable need to be equal.
|
|
||||||
//
|
|
||||||
// See: https://github.com/rust-lang/rust/issues/103763
|
|
||||||
//
|
|
||||||
// A future version of rust will most likely use this form below, where we cast each
|
|
||||||
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
|
|
||||||
// not affect the comparison.
|
|
||||||
//
|
|
||||||
// See: https://github.com/rust-lang/rust/pull/106450
|
|
||||||
let left = Arc::as_ptr(left) as *const ();
|
|
||||||
let right = Arc::as_ptr(right) as *const ();
|
|
||||||
|
|
||||||
left == right
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -822,7 +787,6 @@ mod tests {
|
|||||||
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
|
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
|
||||||
|
|
||||||
let replaced = map
|
let replaced = map
|
||||||
.batch_update()
|
|
||||||
.replace_historic(&remote, downloaded.clone())
|
.replace_historic(&remote, downloaded.clone())
|
||||||
.expect("name derived attributes are the same");
|
.expect("name derived attributes are the same");
|
||||||
assert!(
|
assert!(
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ use super::layer_coverage::LayerCoverageTuple;
|
|||||||
/// These three values are enough to uniquely identify a layer, since
|
/// These three values are enough to uniquely identify a layer, since
|
||||||
/// a layer is obligated to contain all contents within range, so two
|
/// a layer is obligated to contain all contents within range, so two
|
||||||
/// deltas (or images) with the same range have identical content.
|
/// deltas (or images) with the same range have identical content.
|
||||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||||
pub struct LayerKey {
|
pub struct LayerKey {
|
||||||
// TODO I use i128 and u64 because it was easy for prototyping,
|
// TODO I use i128 and u64 because it was easy for prototyping,
|
||||||
// testing, and benchmarking. If we can use the Lsn and Key
|
// testing, and benchmarking. If we can use the Lsn and Key
|
||||||
@@ -438,46 +438,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
|||||||
///
|
///
|
||||||
/// Returns a `Replacement` value describing the outcome; only the case of
|
/// Returns a `Replacement` value describing the outcome; only the case of
|
||||||
/// `Replacement::Replaced` modifies the map and requires a rebuild.
|
/// `Replacement::Replaced` modifies the map and requires a rebuild.
|
||||||
pub fn replace<F>(
|
|
||||||
&mut self,
|
|
||||||
layer_key: &LayerKey,
|
|
||||||
new: Value,
|
|
||||||
check_expected: F,
|
|
||||||
) -> Replacement<Value>
|
|
||||||
where
|
|
||||||
F: FnOnce(&Value) -> bool,
|
|
||||||
{
|
|
||||||
let (slot, in_buffered) = match self.buffer.get(layer_key) {
|
|
||||||
Some(inner @ Some(_)) => {
|
|
||||||
// we compare against the buffered version, because there will be a later
|
|
||||||
// rebuild before querying
|
|
||||||
(inner.as_ref(), true)
|
|
||||||
}
|
|
||||||
Some(None) => {
|
|
||||||
// buffer has removal for this key; it will not be equivalent by any check_expected.
|
|
||||||
return Replacement::RemovalBuffered;
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// no pending modification for the key, check layers
|
|
||||||
(self.layers.get(layer_key), false)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
match slot {
|
|
||||||
Some(existing) if !check_expected(existing) => {
|
|
||||||
// unfortunate clone here, but otherwise the nll borrowck grows the region of
|
|
||||||
// 'a to cover the whole function, and we could not mutate in the other
|
|
||||||
// Some(existing) branch
|
|
||||||
Replacement::Unexpected(existing.clone())
|
|
||||||
}
|
|
||||||
None => Replacement::NotFound,
|
|
||||||
Some(_existing) => {
|
|
||||||
self.insert(layer_key.to_owned(), new);
|
|
||||||
Replacement::Replaced { in_buffered }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn rebuild(&mut self) {
|
pub fn rebuild(&mut self) {
|
||||||
// Find the first LSN that needs to be rebuilt
|
// Find the first LSN that needs to be rebuilt
|
||||||
let rebuild_since: u64 = match self.buffer.iter().next() {
|
let rebuild_since: u64 = match self.buffer.iter().next() {
|
||||||
@@ -521,17 +481,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Iterate all the layers
|
|
||||||
pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
|
|
||||||
// NOTE we can actually perform this without rebuilding,
|
|
||||||
// but it's not necessary for now.
|
|
||||||
if !self.buffer.is_empty() {
|
|
||||||
panic!("rebuild pls")
|
|
||||||
}
|
|
||||||
|
|
||||||
self.layers.values().cloned()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return a reference to a queryable map, assuming all updates
|
/// Return a reference to a queryable map, assuming all updates
|
||||||
/// have already been processed using self.rebuild()
|
/// have already been processed using self.rebuild()
|
||||||
pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage<Value>> {
|
pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage<Value>> {
|
||||||
@@ -670,139 +619,3 @@ fn test_retroactive_simple() {
|
|||||||
assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
|
assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_retroactive_replacement() {
|
|
||||||
let mut map = BufferedHistoricLayerCoverage::new();
|
|
||||||
|
|
||||||
let keys = [
|
|
||||||
LayerKey {
|
|
||||||
key: 0..5,
|
|
||||||
lsn: 100..101,
|
|
||||||
is_image: true,
|
|
||||||
},
|
|
||||||
LayerKey {
|
|
||||||
key: 3..9,
|
|
||||||
lsn: 110..111,
|
|
||||||
is_image: true,
|
|
||||||
},
|
|
||||||
LayerKey {
|
|
||||||
key: 4..6,
|
|
||||||
lsn: 120..121,
|
|
||||||
is_image: true,
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
let layers = [
|
|
||||||
"Image 1".to_string(),
|
|
||||||
"Image 2".to_string(),
|
|
||||||
"Image 3".to_string(),
|
|
||||||
];
|
|
||||||
|
|
||||||
for (key, layer) in keys.iter().zip(layers.iter()) {
|
|
||||||
map.insert(key.to_owned(), layer.to_owned());
|
|
||||||
}
|
|
||||||
|
|
||||||
// rebuild is not necessary here, because replace works for both buffered updates and existing
|
|
||||||
// layers.
|
|
||||||
|
|
||||||
for (key, orig_layer) in keys.iter().zip(layers.iter()) {
|
|
||||||
let replacement = format!("Remote {orig_layer}");
|
|
||||||
|
|
||||||
// evict
|
|
||||||
let ret = map.replace(key, replacement.clone(), |l| l == orig_layer);
|
|
||||||
assert!(
|
|
||||||
matches!(ret, Replacement::Replaced { .. }),
|
|
||||||
"replace {orig_layer}: {ret:?}"
|
|
||||||
);
|
|
||||||
map.rebuild();
|
|
||||||
|
|
||||||
let at = key.lsn.end + 1;
|
|
||||||
|
|
||||||
let version = map.get().expect("rebuilt").get_version(at).unwrap();
|
|
||||||
assert_eq!(
|
|
||||||
version.image_coverage.query(4).as_deref(),
|
|
||||||
Some(replacement.as_str()),
|
|
||||||
"query for 4 at version {at} after eviction",
|
|
||||||
);
|
|
||||||
|
|
||||||
// download
|
|
||||||
let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement);
|
|
||||||
assert!(
|
|
||||||
matches!(ret, Replacement::Replaced { .. }),
|
|
||||||
"replace {orig_layer} back: {ret:?}"
|
|
||||||
);
|
|
||||||
map.rebuild();
|
|
||||||
let version = map.get().expect("rebuilt").get_version(at).unwrap();
|
|
||||||
assert_eq!(
|
|
||||||
version.image_coverage.query(4).as_deref(),
|
|
||||||
Some(orig_layer.as_str()),
|
|
||||||
"query for 4 at version {at} after download",
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn missing_key_is_not_inserted_with_replace() {
|
|
||||||
let mut map = BufferedHistoricLayerCoverage::new();
|
|
||||||
let key = LayerKey {
|
|
||||||
key: 0..5,
|
|
||||||
lsn: 100..101,
|
|
||||||
is_image: true,
|
|
||||||
};
|
|
||||||
|
|
||||||
let ret = map.replace(&key, "should not replace", |_| true);
|
|
||||||
assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
|
|
||||||
map.rebuild();
|
|
||||||
assert!(map
|
|
||||||
.get()
|
|
||||||
.expect("no changes to rebuild")
|
|
||||||
.get_version(102)
|
|
||||||
.is_none());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn replacing_buffered_insert_and_remove() {
|
|
||||||
let mut map = BufferedHistoricLayerCoverage::new();
|
|
||||||
let key = LayerKey {
|
|
||||||
key: 0..5,
|
|
||||||
lsn: 100..101,
|
|
||||||
is_image: true,
|
|
||||||
};
|
|
||||||
|
|
||||||
map.insert(key.clone(), "Image 1");
|
|
||||||
let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1");
|
|
||||||
assert!(
|
|
||||||
matches!(ret, Replacement::Replaced { in_buffered: true }),
|
|
||||||
"{ret:?}"
|
|
||||||
);
|
|
||||||
map.rebuild();
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
map.get()
|
|
||||||
.expect("rebuilt")
|
|
||||||
.get_version(102)
|
|
||||||
.unwrap()
|
|
||||||
.image_coverage
|
|
||||||
.query(4),
|
|
||||||
Some("Remote Image 1")
|
|
||||||
);
|
|
||||||
|
|
||||||
map.remove(key.clone());
|
|
||||||
let ret = map.replace(&key, "should not replace", |_| true);
|
|
||||||
assert!(
|
|
||||||
matches!(ret, Replacement::RemovalBuffered),
|
|
||||||
"cannot replace after scheduled remove: {ret:?}"
|
|
||||||
);
|
|
||||||
|
|
||||||
map.rebuild();
|
|
||||||
|
|
||||||
let ret = map.replace(&key, "should not replace", |_| true);
|
|
||||||
assert!(
|
|
||||||
matches!(ret, Replacement::NotFound),
|
|
||||||
"cannot replace after remove + rebuild: {ret:?}"
|
|
||||||
);
|
|
||||||
|
|
||||||
let at_version = map.get().expect("rebuilt").get_version(102);
|
|
||||||
assert!(at_version.is_none());
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -101,24 +101,24 @@ impl<Value: Clone> LayerCoverage<Value> {
|
|||||||
/// Get the latest (by lsn.end) layer at a given key
|
/// Get the latest (by lsn.end) layer at a given key
|
||||||
///
|
///
|
||||||
/// Complexity: O(log N)
|
/// Complexity: O(log N)
|
||||||
pub fn query(&self, key: i128) -> Option<Value> {
|
pub fn query(&self, key: i128) -> Option<&Value> {
|
||||||
self.nodes
|
self.nodes
|
||||||
.range(..=key)
|
.range(..=key)
|
||||||
.rev()
|
.rev()
|
||||||
.next()?
|
.next()?
|
||||||
.1
|
.1
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|(_, v)| v.clone())
|
.map(|(_, v)| v)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Iterate the changes in layer coverage in a given range. You will likely
|
/// Iterate the changes in layer coverage in a given range. You will likely
|
||||||
/// want to start with self.query(key.start), and then follow up with self.range
|
/// want to start with self.query(key.start), and then follow up with self.range
|
||||||
///
|
///
|
||||||
/// Complexity: O(log N + result_size)
|
/// Complexity: O(log N + result_size)
|
||||||
pub fn range(&self, key: Range<i128>) -> impl '_ + Iterator<Item = (i128, Option<Value>)> {
|
pub fn range(&self, key: Range<i128>) -> impl '_ + Iterator<Item = (i128, Option<&Value>)> {
|
||||||
self.nodes
|
self.nodes
|
||||||
.range(key)
|
.range(key)
|
||||||
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
|
.map(|(k, v)| (*k, v.as_ref().map(|x| &x.1)))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// O(1) clone
|
/// O(1) clone
|
||||||
|
|||||||
@@ -540,11 +540,13 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
use {
|
use {
|
||||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||||
utils::http::error::ApiError,
|
utils::http::error::ApiError,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
pub async fn immediate_gc(
|
pub async fn immediate_gc(
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
|
|||||||
@@ -571,15 +571,14 @@ impl RemoteTimelineClient {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Launch a delete operation in the background.
|
|
||||||
///
|
///
|
||||||
/// The operation does not modify local state but assumes the local files have already been
|
/// Launch a delete operation in the background.
|
||||||
/// deleted, and is used to mirror those changes to remote.
|
|
||||||
///
|
///
|
||||||
/// Note: This schedules an index file upload before the deletions. The
|
/// Note: This schedules an index file upload before the deletions. The
|
||||||
/// deletion won't actually be performed, until any previously scheduled
|
/// deletion won't actually be performed, until any previously scheduled
|
||||||
/// upload operations, and the index file upload, have completed
|
/// upload operations, and the index file upload, have completed
|
||||||
/// succesfully.
|
/// succesfully.
|
||||||
|
///
|
||||||
pub fn schedule_layer_file_deletion(
|
pub fn schedule_layer_file_deletion(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
names: &[LayerFileName],
|
names: &[LayerFileName],
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -13,7 +13,6 @@ use crate::task_mgr::TaskKind;
|
|||||||
use crate::walrecord::NeonWalRecord;
|
use crate::walrecord::NeonWalRecord;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use either::Either;
|
|
||||||
use enum_map::EnumMap;
|
use enum_map::EnumMap;
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
use pageserver_api::models::LayerAccessKind;
|
use pageserver_api::models::LayerAccessKind;
|
||||||
@@ -93,23 +92,7 @@ pub enum ValueReconstructResult {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
|
pub struct LayerAccessStats(Mutex<LayerAccessStatsInner>);
|
||||||
|
|
||||||
/// This struct holds two instances of [`LayerAccessStatsInner`].
|
|
||||||
/// Accesses are recorded to both instances.
|
|
||||||
/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
|
|
||||||
/// The `for_eviction_policy` is never reset.
|
|
||||||
#[derive(Debug, Default, Clone)]
|
|
||||||
struct LayerAccessStatsLocked {
|
|
||||||
for_scraping_api: LayerAccessStatsInner,
|
|
||||||
for_eviction_policy: LayerAccessStatsInner,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LayerAccessStatsLocked {
|
|
||||||
fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
|
|
||||||
[&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Default, Clone)]
|
#[derive(Debug, Default, Clone)]
|
||||||
struct LayerAccessStatsInner {
|
struct LayerAccessStatsInner {
|
||||||
@@ -120,11 +103,11 @@ struct LayerAccessStatsInner {
|
|||||||
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone)]
|
||||||
pub(super) struct LayerAccessStatFullDetails {
|
struct LayerAccessStatFullDetails {
|
||||||
pub(super) when: SystemTime,
|
when: SystemTime,
|
||||||
pub(super) task_kind: TaskKind,
|
task_kind: TaskKind,
|
||||||
pub(super) access_kind: LayerAccessKind,
|
access_kind: LayerAccessKind,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy, strum_macros::EnumString)]
|
#[derive(Clone, Copy, strum_macros::EnumString)]
|
||||||
@@ -143,7 +126,7 @@ fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl LayerAccessStatFullDetails {
|
impl LayerAccessStatFullDetails {
|
||||||
fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
fn to_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
|
||||||
let Self {
|
let Self {
|
||||||
when,
|
when,
|
||||||
task_kind,
|
task_kind,
|
||||||
@@ -159,13 +142,13 @@ impl LayerAccessStatFullDetails {
|
|||||||
|
|
||||||
impl LayerAccessStats {
|
impl LayerAccessStats {
|
||||||
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
||||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
|
||||||
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
||||||
new
|
new
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn for_new_layer_file() -> Self {
|
pub(crate) fn for_new_layer_file() -> Self {
|
||||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
|
||||||
new.record_residence_event(
|
new.record_residence_event(
|
||||||
LayerResidenceStatus::Resident,
|
LayerResidenceStatus::Resident,
|
||||||
LayerResidenceEventReason::LayerCreate,
|
LayerResidenceEventReason::LayerCreate,
|
||||||
@@ -193,43 +176,38 @@ impl LayerAccessStats {
|
|||||||
status: LayerResidenceStatus,
|
status: LayerResidenceStatus,
|
||||||
reason: LayerResidenceEventReason,
|
reason: LayerResidenceEventReason,
|
||||||
) {
|
) {
|
||||||
let mut locked = self.0.lock().unwrap();
|
let mut inner = self.0.lock().unwrap();
|
||||||
locked.iter_mut().for_each(|inner| {
|
inner
|
||||||
inner
|
.last_residence_changes
|
||||||
.last_residence_changes
|
.write(LayerResidenceEvent::new(status, reason));
|
||||||
.write(LayerResidenceEvent::new(status, reason))
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
|
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
|
||||||
|
let mut inner = self.0.lock().unwrap();
|
||||||
let this_access = LayerAccessStatFullDetails {
|
let this_access = LayerAccessStatFullDetails {
|
||||||
when: SystemTime::now(),
|
when: SystemTime::now(),
|
||||||
task_kind,
|
task_kind,
|
||||||
access_kind,
|
access_kind,
|
||||||
};
|
};
|
||||||
|
inner
|
||||||
let mut locked = self.0.lock().unwrap();
|
.first_access
|
||||||
locked.iter_mut().for_each(|inner| {
|
.get_or_insert_with(|| this_access.clone());
|
||||||
inner.first_access.get_or_insert(this_access);
|
inner.count_by_access_kind[access_kind] += 1;
|
||||||
inner.count_by_access_kind[access_kind] += 1;
|
inner.task_kind_flag |= task_kind;
|
||||||
inner.task_kind_flag |= task_kind;
|
inner.last_accesses.write(this_access);
|
||||||
inner.last_accesses.write(this_access);
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
fn to_api_model(
|
||||||
fn as_api_model(
|
|
||||||
&self,
|
&self,
|
||||||
reset: LayerAccessStatsReset,
|
reset: LayerAccessStatsReset,
|
||||||
) -> pageserver_api::models::LayerAccessStats {
|
) -> pageserver_api::models::LayerAccessStats {
|
||||||
let mut locked = self.0.lock().unwrap();
|
let mut inner = self.0.lock().unwrap();
|
||||||
let inner = &mut locked.for_scraping_api;
|
|
||||||
let LayerAccessStatsInner {
|
let LayerAccessStatsInner {
|
||||||
first_access,
|
first_access,
|
||||||
count_by_access_kind,
|
count_by_access_kind,
|
||||||
task_kind_flag,
|
task_kind_flag,
|
||||||
last_accesses,
|
last_accesses,
|
||||||
last_residence_changes,
|
last_residence_changes,
|
||||||
} = inner;
|
} = &*inner;
|
||||||
let ret = pageserver_api::models::LayerAccessStats {
|
let ret = pageserver_api::models::LayerAccessStats {
|
||||||
access_count_by_access_kind: count_by_access_kind
|
access_count_by_access_kind: count_by_access_kind
|
||||||
.iter()
|
.iter()
|
||||||
@@ -239,8 +217,8 @@ impl LayerAccessStats {
|
|||||||
.iter()
|
.iter()
|
||||||
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
||||||
.collect(),
|
.collect(),
|
||||||
first: first_access.as_ref().map(|a| a.as_api_model()),
|
first: first_access.as_ref().map(|a| a.to_api_model()),
|
||||||
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
accesses_history: last_accesses.map(|m| m.to_api_model()),
|
||||||
residence_events_history: last_residence_changes.clone(),
|
residence_events_history: last_residence_changes.clone(),
|
||||||
};
|
};
|
||||||
match reset {
|
match reset {
|
||||||
@@ -254,20 +232,6 @@ impl LayerAccessStats {
|
|||||||
}
|
}
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) fn most_recent_access_or_residence_event(
|
|
||||||
&self,
|
|
||||||
) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
|
|
||||||
let locked = self.0.lock().unwrap();
|
|
||||||
let inner = &locked.for_eviction_policy;
|
|
||||||
match inner.last_accesses.recent() {
|
|
||||||
Some(a) => Either::Left(*a),
|
|
||||||
None => match inner.last_residence_changes.recent() {
|
|
||||||
Some(e) => Either::Right(e.clone()),
|
|
||||||
None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change event"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||||
@@ -364,7 +328,7 @@ pub trait PersistentLayer: Layer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Permanently remove this layer from disk.
|
/// Permanently remove this layer from disk.
|
||||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
fn delete(&self) -> Result<()>;
|
||||||
|
|
||||||
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||||
None
|
None
|
||||||
@@ -485,14 +449,3 @@ enum PathOrConf {
|
|||||||
Path(PathBuf),
|
Path(PathBuf),
|
||||||
Conf(&'static PageServerConf),
|
Conf(&'static PageServerConf),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Range wrapping newtype, which uses display to render Debug.
|
|
||||||
///
|
|
||||||
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
|
|
||||||
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
|
|
||||||
|
|
||||||
impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
write!(f, "{}..{}", self.0.start, self.0.end)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -194,10 +194,8 @@ pub struct DeltaLayer {
|
|||||||
|
|
||||||
impl std::fmt::Debug for DeltaLayer {
|
impl std::fmt::Debug for DeltaLayer {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
use super::RangeDisplayDebug;
|
|
||||||
|
|
||||||
f.debug_struct("DeltaLayer")
|
f.debug_struct("DeltaLayer")
|
||||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
.field("key_range", &self.key_range)
|
||||||
.field("lsn_range", &self.lsn_range)
|
.field("lsn_range", &self.lsn_range)
|
||||||
.field("file_size", &self.file_size)
|
.field("file_size", &self.file_size)
|
||||||
.field("inner", &self.inner)
|
.field("inner", &self.inner)
|
||||||
@@ -438,7 +436,7 @@ impl PersistentLayer for DeltaLayer {
|
|||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
fn delete(&self) -> Result<()> {
|
||||||
// delete underlying file
|
// delete underlying file
|
||||||
fs::remove_file(self.path())?;
|
fs::remove_file(self.path())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -452,7 +450,7 @@ impl PersistentLayer for DeltaLayer {
|
|||||||
let layer_file_name = self.filename().file_name();
|
let layer_file_name = self.filename().file_name();
|
||||||
let lsn_range = self.get_lsn_range();
|
let lsn_range = self.get_lsn_range();
|
||||||
|
|
||||||
let access_stats = self.access_stats.as_api_model(reset);
|
let access_stats = self.access_stats.to_api_model(reset);
|
||||||
|
|
||||||
HistoricLayerInfo::Delta {
|
HistoricLayerInfo::Delta {
|
||||||
layer_file_name,
|
layer_file_name,
|
||||||
|
|||||||
@@ -10,23 +10,12 @@ use std::str::FromStr;
|
|||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
// Note: Timeline::load_layer_map() relies on this sort order
|
// Note: Timeline::load_layer_map() relies on this sort order
|
||||||
#[derive(PartialEq, Eq, Clone, Hash)]
|
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||||
pub struct DeltaFileName {
|
pub struct DeltaFileName {
|
||||||
pub key_range: Range<Key>,
|
pub key_range: Range<Key>,
|
||||||
pub lsn_range: Range<Lsn>,
|
pub lsn_range: Range<Lsn>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for DeltaFileName {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
use super::RangeDisplayDebug;
|
|
||||||
|
|
||||||
f.debug_struct("DeltaFileName")
|
|
||||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
|
||||||
.field("lsn_range", &self.lsn_range)
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartialOrd for DeltaFileName {
|
impl PartialOrd for DeltaFileName {
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
Some(self.cmp(other))
|
Some(self.cmp(other))
|
||||||
@@ -111,23 +100,12 @@ impl fmt::Display for DeltaFileName {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(PartialEq, Eq, Clone, Hash)]
|
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||||
pub struct ImageFileName {
|
pub struct ImageFileName {
|
||||||
pub key_range: Range<Key>,
|
pub key_range: Range<Key>,
|
||||||
pub lsn: Lsn,
|
pub lsn: Lsn,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for ImageFileName {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
use super::RangeDisplayDebug;
|
|
||||||
|
|
||||||
f.debug_struct("ImageFileName")
|
|
||||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
|
||||||
.field("lsn", &self.lsn)
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartialOrd for ImageFileName {
|
impl PartialOrd for ImageFileName {
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
Some(self.cmp(other))
|
Some(self.cmp(other))
|
||||||
|
|||||||
@@ -119,10 +119,8 @@ pub struct ImageLayer {
|
|||||||
|
|
||||||
impl std::fmt::Debug for ImageLayer {
|
impl std::fmt::Debug for ImageLayer {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
use super::RangeDisplayDebug;
|
|
||||||
|
|
||||||
f.debug_struct("ImageLayer")
|
f.debug_struct("ImageLayer")
|
||||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
.field("key_range", &self.key_range)
|
||||||
.field("file_size", &self.file_size)
|
.field("file_size", &self.file_size)
|
||||||
.field("lsn", &self.lsn)
|
.field("lsn", &self.lsn)
|
||||||
.field("inner", &self.inner)
|
.field("inner", &self.inner)
|
||||||
@@ -252,7 +250,7 @@ impl PersistentLayer for ImageLayer {
|
|||||||
unimplemented!();
|
unimplemented!();
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
fn delete(&self) -> Result<()> {
|
||||||
// delete underlying file
|
// delete underlying file
|
||||||
fs::remove_file(self.path())?;
|
fs::remove_file(self.path())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -271,7 +269,7 @@ impl PersistentLayer for ImageLayer {
|
|||||||
layer_file_size: Some(self.file_size),
|
layer_file_size: Some(self.file_size),
|
||||||
lsn_start: lsn_range.start,
|
lsn_start: lsn_range.start,
|
||||||
remote: false,
|
remote: false,
|
||||||
access_stats: self.access_stats.as_api_model(reset),
|
access_stats: self.access_stats.to_api_model(reset),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -49,17 +49,6 @@ pub struct RemoteLayer {
|
|||||||
access_stats: LayerAccessStats,
|
access_stats: LayerAccessStats,
|
||||||
|
|
||||||
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
||||||
|
|
||||||
/// Has `LayerMap::replace` failed for this (true) or not (false).
|
|
||||||
///
|
|
||||||
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
|
|
||||||
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
|
|
||||||
/// unprocessable, because a LayerMap::replace failed.
|
|
||||||
///
|
|
||||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
|
||||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
|
||||||
/// `Timeline::download_remote_layer`, which also logs.
|
|
||||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for RemoteLayer {
|
impl std::fmt::Debug for RemoteLayer {
|
||||||
@@ -155,8 +144,8 @@ impl PersistentLayer for RemoteLayer {
|
|||||||
bail!("cannot iterate a remote layer");
|
bail!("cannot iterate a remote layer");
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
fn delete(&self) -> Result<()> {
|
||||||
bail!("remote layer has no layer file");
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||||
@@ -182,7 +171,7 @@ impl PersistentLayer for RemoteLayer {
|
|||||||
lsn_start: lsn_range.start,
|
lsn_start: lsn_range.start,
|
||||||
lsn_end: lsn_range.end,
|
lsn_end: lsn_range.end,
|
||||||
remote: true,
|
remote: true,
|
||||||
access_stats: self.access_stats.as_api_model(reset),
|
access_stats: self.access_stats.to_api_model(reset),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
HistoricLayerInfo::Image {
|
HistoricLayerInfo::Image {
|
||||||
@@ -190,7 +179,7 @@ impl PersistentLayer for RemoteLayer {
|
|||||||
layer_file_size: self.layer_metadata.file_size(),
|
layer_file_size: self.layer_metadata.file_size(),
|
||||||
lsn_start: lsn_range.start,
|
lsn_start: lsn_range.start,
|
||||||
remote: true,
|
remote: true,
|
||||||
access_stats: self.access_stats.as_api_model(reset),
|
access_stats: self.access_stats.to_api_model(reset),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -218,7 +207,6 @@ impl RemoteLayer {
|
|||||||
file_name: fname.to_owned().into(),
|
file_name: fname.to_owned().into(),
|
||||||
layer_metadata: layer_metadata.clone(),
|
layer_metadata: layer_metadata.clone(),
|
||||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
|
||||||
access_stats,
|
access_stats,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -240,7 +228,6 @@ impl RemoteLayer {
|
|||||||
file_name: fname.to_owned().into(),
|
file_name: fname.to_owned().into(),
|
||||||
layer_metadata: layer_metadata.clone(),
|
layer_metadata: layer_metadata.clone(),
|
||||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
|
||||||
access_stats,
|
access_stats,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::Duration;
|
||||||
|
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::metrics::TENANT_TASK_EVENTS;
|
use crate::metrics::TENANT_TASK_EVENTS;
|
||||||
@@ -11,7 +11,6 @@ use crate::task_mgr;
|
|||||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||||
use crate::tenant::mgr;
|
use crate::tenant::mgr;
|
||||||
use crate::tenant::{Tenant, TenantState};
|
use crate::tenant::{Tenant, TenantState};
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
@@ -54,55 +53,37 @@ async fn compaction_loop(tenant_id: TenantId) {
|
|||||||
info!("starting");
|
info!("starting");
|
||||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||||
async {
|
async {
|
||||||
let cancel = task_mgr::shutdown_token();
|
|
||||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||||
let mut first = true;
|
|
||||||
loop {
|
loop {
|
||||||
trace!("waking up");
|
trace!("waking up");
|
||||||
|
|
||||||
let tenant = tokio::select! {
|
let tenant = tokio::select! {
|
||||||
_ = cancel.cancelled() => {
|
_ = task_mgr::shutdown_watcher() => {
|
||||||
info!("received cancellation request");
|
info!("received cancellation request");
|
||||||
return;
|
return;
|
||||||
},
|
},
|
||||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||||
ControlFlow::Break(()) => return,
|
ControlFlow::Break(()) => return,
|
||||||
ControlFlow::Continue(tenant) => tenant,
|
ControlFlow::Continue(tenant) => tenant,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
let period = tenant.get_compaction_period();
|
let mut sleep_duration = tenant.get_compaction_period();
|
||||||
|
if sleep_duration == Duration::ZERO {
|
||||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
|
||||||
// loop, #3501. There are also additional "allowed_errors" in tests.
|
|
||||||
if first {
|
|
||||||
first = false;
|
|
||||||
if random_init_delay(period, &cancel).await.is_err() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let started_at = Instant::now();
|
|
||||||
|
|
||||||
let sleep_duration = if period == Duration::ZERO {
|
|
||||||
info!("automatic compaction is disabled");
|
info!("automatic compaction is disabled");
|
||||||
// check again in 10 seconds, in case it's been enabled again.
|
// check again in 10 seconds, in case it's been enabled again.
|
||||||
Duration::from_secs(10)
|
sleep_duration = Duration::from_secs(10);
|
||||||
} else {
|
} else {
|
||||||
// Run compaction
|
// Run compaction
|
||||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||||
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
|
sleep_duration = wait_duration;
|
||||||
wait_duration
|
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||||
} else {
|
|
||||||
period
|
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
|
|
||||||
|
|
||||||
// Sleep
|
// Sleep
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = cancel.cancelled() => {
|
_ = task_mgr::shutdown_watcher() => {
|
||||||
info!("received cancellation request during idling");
|
info!("received cancellation request during idling");
|
||||||
break;
|
break;
|
||||||
},
|
},
|
||||||
@@ -124,16 +105,14 @@ async fn gc_loop(tenant_id: TenantId) {
|
|||||||
info!("starting");
|
info!("starting");
|
||||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||||
async {
|
async {
|
||||||
let cancel = task_mgr::shutdown_token();
|
|
||||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||||
// cutoff specified as time.
|
// cutoff specified as time.
|
||||||
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||||
let mut first = true;
|
|
||||||
loop {
|
loop {
|
||||||
trace!("waking up");
|
trace!("waking up");
|
||||||
|
|
||||||
let tenant = tokio::select! {
|
let tenant = tokio::select! {
|
||||||
_ = cancel.cancelled() => {
|
_ = task_mgr::shutdown_watcher() => {
|
||||||
info!("received cancellation request");
|
info!("received cancellation request");
|
||||||
return;
|
return;
|
||||||
},
|
},
|
||||||
@@ -143,38 +122,27 @@ async fn gc_loop(tenant_id: TenantId) {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
let period = tenant.get_gc_period();
|
let gc_period = tenant.get_gc_period();
|
||||||
|
let gc_horizon = tenant.get_gc_horizon();
|
||||||
if first {
|
let mut sleep_duration = gc_period;
|
||||||
first = false;
|
if sleep_duration == Duration::ZERO {
|
||||||
if random_init_delay(period, &cancel).await.is_err() {
|
info!("automatic GC is disabled");
|
||||||
break;
|
// check again in 10 seconds, in case it's been enabled again.
|
||||||
|
sleep_duration = Duration::from_secs(10);
|
||||||
|
} else {
|
||||||
|
// Run gc
|
||||||
|
if gc_horizon > 0 {
|
||||||
|
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
|
||||||
|
{
|
||||||
|
sleep_duration = wait_duration;
|
||||||
|
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let started_at = Instant::now();
|
|
||||||
|
|
||||||
let gc_horizon = tenant.get_gc_horizon();
|
|
||||||
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
|
|
||||||
info!("automatic GC is disabled");
|
|
||||||
// check again in 10 seconds, in case it's been enabled again.
|
|
||||||
Duration::from_secs(10)
|
|
||||||
} else {
|
|
||||||
// Run gc
|
|
||||||
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
|
|
||||||
if let Err(e) = res {
|
|
||||||
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
|
|
||||||
wait_duration
|
|
||||||
} else {
|
|
||||||
period
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
warn_when_period_overrun(started_at.elapsed(), period, "gc");
|
|
||||||
|
|
||||||
// Sleep
|
// Sleep
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = cancel.cancelled() => {
|
_ = task_mgr::shutdown_watcher() => {
|
||||||
info!("received cancellation request during idling");
|
info!("received cancellation request during idling");
|
||||||
break;
|
break;
|
||||||
},
|
},
|
||||||
@@ -229,49 +197,3 @@ async fn wait_for_active_tenant(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
|
||||||
#[error("cancelled")]
|
|
||||||
pub(crate) struct Cancelled;
|
|
||||||
|
|
||||||
/// Provide a random delay for background task initialization.
|
|
||||||
///
|
|
||||||
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
|
|
||||||
/// different periods for more stable load.
|
|
||||||
pub(crate) async fn random_init_delay(
|
|
||||||
period: Duration,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<(), Cancelled> {
|
|
||||||
use rand::Rng;
|
|
||||||
|
|
||||||
let d = {
|
|
||||||
let mut rng = rand::thread_rng();
|
|
||||||
|
|
||||||
// gen_range asserts that the range cannot be empty, which it could be because period can
|
|
||||||
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
|
|
||||||
let period = std::cmp::max(period, Duration::from_secs(10));
|
|
||||||
|
|
||||||
// semi-ok default as the source of jitter
|
|
||||||
rng.gen_range(Duration::ZERO..=period)
|
|
||||||
};
|
|
||||||
|
|
||||||
tokio::select! {
|
|
||||||
_ = cancel.cancelled() => Err(Cancelled),
|
|
||||||
_ = tokio::time::sleep(d) => Ok(()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
|
|
||||||
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
|
|
||||||
if elapsed >= period && period != Duration::ZERO {
|
|
||||||
// humantime does no significant digits clamping whereas Duration's debug is a bit more
|
|
||||||
// intelligent. however it makes sense to keep the "configuration format" for period, even
|
|
||||||
// though there's no way to output the actual config value.
|
|
||||||
warn!(
|
|
||||||
?elapsed,
|
|
||||||
period = %humantime::format_duration(period),
|
|
||||||
task,
|
|
||||||
"task iteration took longer than the configured period"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
//!
|
//!
|
||||||
|
|
||||||
mod eviction_task;
|
|
||||||
mod walreceiver;
|
mod walreceiver;
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, ensure, Context};
|
use anyhow::{anyhow, bail, ensure, Context};
|
||||||
@@ -19,7 +18,6 @@ use tracing::*;
|
|||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
|
|
||||||
use std::cmp::{max, min, Ordering};
|
use std::cmp::{max, min, Ordering};
|
||||||
use std::collections::BinaryHeap;
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::ops::{Deref, Range};
|
use std::ops::{Deref, Range};
|
||||||
@@ -49,7 +47,7 @@ use crate::metrics::TimelineMetrics;
|
|||||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||||
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||||
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
|
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
|
||||||
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
|
use crate::tenant::config::TenantConfOpt;
|
||||||
use pageserver_api::reltag::RelTag;
|
use pageserver_api::reltag::RelTag;
|
||||||
|
|
||||||
use postgres_connection::PgConnectionConfig;
|
use postgres_connection::PgConnectionConfig;
|
||||||
@@ -83,25 +81,6 @@ enum FlushLoopState {
|
|||||||
Exited,
|
Exited,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
||||||
pub struct Hole {
|
|
||||||
key_range: Range<Key>,
|
|
||||||
coverage_size: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Ord for Hole {
|
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
|
||||||
other.coverage_size.cmp(&self.coverage_size) // inverse order
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartialOrd for Hole {
|
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
||||||
Some(self.cmp(other))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Timeline {
|
pub struct Timeline {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||||
@@ -312,9 +291,18 @@ impl LogicalSize {
|
|||||||
// we change the type.
|
// we change the type.
|
||||||
match self.initial_logical_size.get() {
|
match self.initial_logical_size.get() {
|
||||||
Some(initial_size) => {
|
Some(initial_size) => {
|
||||||
initial_size.checked_add_signed(size_increment)
|
let absolute_size_increment = u64::try_from(
|
||||||
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
size_increment
|
||||||
.map(CurrentLogicalSize::Exact)
|
.checked_abs()
|
||||||
|
.with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?,
|
||||||
|
).expect("casting nonnegative i64 to u64 should not fail");
|
||||||
|
|
||||||
|
if size_increment < 0 {
|
||||||
|
initial_size.checked_sub(absolute_size_increment)
|
||||||
|
} else {
|
||||||
|
initial_size.checked_add(absolute_size_increment)
|
||||||
|
}.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
||||||
|
.map(CurrentLogicalSize::Exact)
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
|
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
|
||||||
@@ -633,10 +621,7 @@ impl Timeline {
|
|||||||
self.flush_frozen_layers_and_wait().await
|
self.flush_frozen_layers_and_wait().await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Outermost timeline compaction operation; downloads needed layers.
|
|
||||||
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||||
const ROUNDS: usize = 2;
|
|
||||||
|
|
||||||
let last_record_lsn = self.get_last_record_lsn();
|
let last_record_lsn = self.get_last_record_lsn();
|
||||||
|
|
||||||
// Last record Lsn could be zero in case the timeline was just created
|
// Last record Lsn could be zero in case the timeline was just created
|
||||||
@@ -645,86 +630,6 @@ impl Timeline {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
// retry two times to allow first round to find layers which need to be downloaded, then
|
|
||||||
// download them, then retry compaction
|
|
||||||
for round in 0..ROUNDS {
|
|
||||||
// should we error out with the most specific error?
|
|
||||||
let last_round = round == ROUNDS - 1;
|
|
||||||
|
|
||||||
let res = self.compact_inner(ctx).await;
|
|
||||||
|
|
||||||
// If `create_image_layers' or `compact_level0` scheduled any
|
|
||||||
// uploads or deletions, but didn't update the index file yet,
|
|
||||||
// do it now.
|
|
||||||
//
|
|
||||||
// This isn't necessary for correctness, the remote state is
|
|
||||||
// consistent without the uploads and deletions, and we would
|
|
||||||
// update the index file on next flush iteration too. But it
|
|
||||||
// could take a while until that happens.
|
|
||||||
//
|
|
||||||
// Additionally, only do this once before we return from this function.
|
|
||||||
if last_round || res.is_ok() {
|
|
||||||
if let Some(remote_client) = &self.remote_client {
|
|
||||||
remote_client.schedule_index_upload_for_file_changes()?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let rls = match res {
|
|
||||||
Ok(()) => return Ok(()),
|
|
||||||
Err(CompactionError::DownloadRequired(rls)) if !last_round => {
|
|
||||||
// this can be done at most one time before exiting, waiting
|
|
||||||
rls
|
|
||||||
}
|
|
||||||
Err(CompactionError::DownloadRequired(rls)) => {
|
|
||||||
anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
|
|
||||||
}
|
|
||||||
Err(CompactionError::Other(e)) => {
|
|
||||||
return Err(e);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// this path can be visited in the second round of retrying, if first one found that we
|
|
||||||
// must first download some remote layers
|
|
||||||
let total = rls.len();
|
|
||||||
|
|
||||||
let mut downloads = rls
|
|
||||||
.into_iter()
|
|
||||||
.map(|rl| self.download_remote_layer(rl))
|
|
||||||
.collect::<futures::stream::FuturesUnordered<_>>();
|
|
||||||
|
|
||||||
let mut failed = 0;
|
|
||||||
|
|
||||||
let cancelled = task_mgr::shutdown_watcher();
|
|
||||||
tokio::pin!(cancelled);
|
|
||||||
|
|
||||||
loop {
|
|
||||||
tokio::select! {
|
|
||||||
_ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
|
|
||||||
res = downloads.next() => {
|
|
||||||
match res {
|
|
||||||
Some(Ok(())) => {},
|
|
||||||
Some(Err(e)) => {
|
|
||||||
warn!("Downloading remote layer for compaction failed: {e:#}");
|
|
||||||
failed += 1;
|
|
||||||
}
|
|
||||||
None => break,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if failed != 0 {
|
|
||||||
anyhow::bail!("{failed} out of {total} layers failed to download, retrying later");
|
|
||||||
}
|
|
||||||
|
|
||||||
// if everything downloaded fine, lets try again
|
|
||||||
}
|
|
||||||
|
|
||||||
unreachable!("retry loop exits")
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compaction which might need to be retried after downloading remote layers.
|
|
||||||
async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
|
|
||||||
//
|
//
|
||||||
// High level strategy for compaction / image creation:
|
// High level strategy for compaction / image creation:
|
||||||
//
|
//
|
||||||
@@ -763,7 +668,7 @@ impl Timeline {
|
|||||||
// Is the timeline being deleted?
|
// Is the timeline being deleted?
|
||||||
let state = *self.state.borrow();
|
let state = *self.state.borrow();
|
||||||
if state == TimelineState::Stopping {
|
if state == TimelineState::Stopping {
|
||||||
return Err(anyhow::anyhow!("timeline is Stopping").into());
|
anyhow::bail!("timeline is Stopping");
|
||||||
}
|
}
|
||||||
|
|
||||||
let target_file_size = self.get_checkpoint_distance();
|
let target_file_size = self.get_checkpoint_distance();
|
||||||
@@ -783,8 +688,7 @@ impl Timeline {
|
|||||||
// "enough".
|
// "enough".
|
||||||
let layer_paths_to_upload = self
|
let layer_paths_to_upload = self
|
||||||
.create_image_layers(&partitioning, lsn, false, ctx)
|
.create_image_layers(&partitioning, lsn, false, ctx)
|
||||||
.await
|
.await?;
|
||||||
.map_err(anyhow::Error::from)?;
|
|
||||||
if let Some(remote_client) = &self.remote_client {
|
if let Some(remote_client) = &self.remote_client {
|
||||||
for (path, layer_metadata) in layer_paths_to_upload {
|
for (path, layer_metadata) in layer_paths_to_upload {
|
||||||
remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
|
remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
|
||||||
@@ -796,6 +700,18 @@ impl Timeline {
|
|||||||
self.compact_level0(&layer_removal_cs, target_file_size, ctx)
|
self.compact_level0(&layer_removal_cs, target_file_size, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
timer.stop_and_record();
|
timer.stop_and_record();
|
||||||
|
|
||||||
|
// If `create_image_layers' or `compact_level0` scheduled any
|
||||||
|
// uploads or deletions, but didn't update the index file yet,
|
||||||
|
// do it now.
|
||||||
|
//
|
||||||
|
// This isn't necessary for correctness, the remote state is
|
||||||
|
// consistent without the uploads and deletions, and we would
|
||||||
|
// update the index file on next flush iteration too. But it
|
||||||
|
// could take a while until that happens.
|
||||||
|
if let Some(remote_client) = &self.remote_client {
|
||||||
|
remote_client.schedule_index_upload_for_file_changes()?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
// no partitioning? This is normal, if the timeline was just created
|
// no partitioning? This is normal, if the timeline was just created
|
||||||
@@ -885,7 +801,6 @@ impl Timeline {
|
|||||||
pub fn activate(self: &Arc<Self>) {
|
pub fn activate(self: &Arc<Self>) {
|
||||||
self.set_state(TimelineState::Active);
|
self.set_state(TimelineState::Active);
|
||||||
self.launch_wal_receiver();
|
self.launch_wal_receiver();
|
||||||
self.launch_eviction_task();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_state(&self, new_state: TimelineState) {
|
pub fn set_state(&self, new_state: TimelineState) {
|
||||||
@@ -952,107 +867,24 @@ impl Timeline {
|
|||||||
Ok(Some(true))
|
Ok(Some(true))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Like [`evict_layer_batch`], but for just one layer.
|
|
||||||
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
|
||||||
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||||
let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) };
|
let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) };
|
||||||
let remote_client = self
|
if local_layer.is_remote_layer() {
|
||||||
.remote_client
|
return Ok(Some(false));
|
||||||
.as_ref()
|
|
||||||
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
|
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
|
||||||
let results = self
|
|
||||||
.evict_layer_batch(remote_client, &[local_layer], cancel)
|
|
||||||
.await?;
|
|
||||||
assert_eq!(results.len(), 1);
|
|
||||||
let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
|
|
||||||
match result {
|
|
||||||
None => anyhow::bail!("task_mgr shutdown requested"),
|
|
||||||
Some(Ok(b)) => Ok(Some(b)),
|
|
||||||
Some(Err(e)) => Err(e),
|
|
||||||
}
|
}
|
||||||
}
|
let Some(remote_client) = &self.remote_client else { return Ok(Some(false)) };
|
||||||
|
|
||||||
/// Evict multiple layers at once, continuing through errors.
|
// ensure the current layer is uploaded for sure
|
||||||
///
|
|
||||||
/// Try to evict the given `layers_to_evict` by
|
|
||||||
///
|
|
||||||
/// 1. Replacing the given layer object in the layer map with a corresponding [`RemoteLayer`] object.
|
|
||||||
/// 2. Deleting the now unreferenced layer file from disk.
|
|
||||||
///
|
|
||||||
/// The `remote_client` should be this timeline's `self.remote_client`.
|
|
||||||
/// We make the caller provide it so that they are responsible for handling the case
|
|
||||||
/// where someone wants to evict the layer but no remote storage is configured.
|
|
||||||
///
|
|
||||||
/// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
|
|
||||||
/// If `Err()` is returned, no eviction was attempted.
|
|
||||||
/// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
|
|
||||||
/// Meaning of each `result[i]`:
|
|
||||||
/// - `Some(Err(...))` if layer replacement failed for an unexpected reason
|
|
||||||
/// - `Some(Ok(true))` if everything went well.
|
|
||||||
/// - `Some(Ok(false))` if there was an expected reason why the layer could not be replaced, e.g.:
|
|
||||||
/// - evictee was not yet downloaded
|
|
||||||
/// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
|
|
||||||
/// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
|
|
||||||
async fn evict_layer_batch(
|
|
||||||
&self,
|
|
||||||
remote_client: &Arc<RemoteTimelineClient>,
|
|
||||||
layers_to_evict: &[Arc<dyn PersistentLayer>],
|
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
|
|
||||||
// ensure that the layers have finished uploading
|
|
||||||
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
|
||||||
remote_client
|
remote_client
|
||||||
.wait_completion()
|
.wait_completion()
|
||||||
.await
|
.await
|
||||||
.context("wait for layer upload ops to complete")?;
|
.context("wait for layer upload ops to complete")?;
|
||||||
|
|
||||||
// now lock out layer removal (compaction, gc, timeline deletion)
|
let layer_metadata = LayerFileMetadata::new(
|
||||||
let layer_removal_guard = self.layer_removal_cs.lock().await;
|
local_layer
|
||||||
|
.file_size()
|
||||||
// start the batch update
|
.expect("Local layer should have a file size"),
|
||||||
let mut layer_map = self.layers.write().unwrap();
|
);
|
||||||
let mut batch_updates = layer_map.batch_update();
|
|
||||||
|
|
||||||
let mut results = Vec::with_capacity(layers_to_evict.len());
|
|
||||||
|
|
||||||
for l in layers_to_evict.iter() {
|
|
||||||
let res = if cancel.is_cancelled() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates))
|
|
||||||
};
|
|
||||||
results.push(res);
|
|
||||||
}
|
|
||||||
|
|
||||||
// commit the updates & release locks
|
|
||||||
batch_updates.flush();
|
|
||||||
drop(layer_map);
|
|
||||||
drop(layer_removal_guard);
|
|
||||||
|
|
||||||
assert_eq!(results.len(), layers_to_evict.len());
|
|
||||||
Ok(results)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn evict_layer_batch_impl(
|
|
||||||
&self,
|
|
||||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
|
||||||
local_layer: &Arc<dyn PersistentLayer>,
|
|
||||||
batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
|
|
||||||
) -> anyhow::Result<bool> {
|
|
||||||
use super::layer_map::Replacement;
|
|
||||||
|
|
||||||
if local_layer.is_remote_layer() {
|
|
||||||
return Ok(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
let layer_file_size = local_layer
|
|
||||||
.file_size()
|
|
||||||
.expect("Local layer should have a file size");
|
|
||||||
|
|
||||||
let layer_metadata = LayerFileMetadata::new(layer_file_size);
|
|
||||||
|
|
||||||
let new_remote_layer = Arc::new(match local_layer.filename() {
|
let new_remote_layer = Arc::new(match local_layer.filename() {
|
||||||
LayerFileName::Image(image_name) => RemoteLayer::new_img(
|
LayerFileName::Image(image_name) => RemoteLayer::new_img(
|
||||||
self.tenant_id,
|
self.tenant_id,
|
||||||
@@ -1074,45 +906,16 @@ impl Timeline {
|
|||||||
),
|
),
|
||||||
});
|
});
|
||||||
|
|
||||||
let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
|
let gc_lock = self.layer_removal_cs.lock().await;
|
||||||
Replacement::Replaced { .. } => {
|
let mut layers = self.layers.write().unwrap();
|
||||||
if let Err(e) = local_layer.delete_resident_layer_file() {
|
let mut updates = layers.batch_update();
|
||||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
self.delete_historic_layer(&gc_lock, local_layer, &mut updates)?;
|
||||||
}
|
updates.insert_historic(new_remote_layer);
|
||||||
// Always decrement the physical size gauge, even if we failed to delete the file.
|
updates.flush();
|
||||||
// Rationale: we already replaced the layer with a remote layer in the layer map,
|
drop(layers);
|
||||||
// and any subsequent download_remote_layer will
|
drop(gc_lock);
|
||||||
// 1. overwrite the file on disk and
|
|
||||||
// 2. add the downloaded size to the resident size gauge.
|
|
||||||
//
|
|
||||||
// If there is no re-download, and we restart the pageserver, then load_layer_map
|
|
||||||
// will treat the file as a local layer again, count it towards resident size,
|
|
||||||
// and it'll be like the layer removal never happened.
|
|
||||||
// The bump in resident size is perhaps unexpected but overall a robust behavior.
|
|
||||||
self.metrics
|
|
||||||
.resident_physical_size_gauge
|
|
||||||
.sub(layer_file_size);
|
|
||||||
|
|
||||||
true
|
Ok(Some(true))
|
||||||
}
|
|
||||||
Replacement::NotFound => {
|
|
||||||
debug!(evicted=?local_layer, "layer was no longer in layer map");
|
|
||||||
false
|
|
||||||
}
|
|
||||||
Replacement::RemovalBuffered => {
|
|
||||||
unreachable!("not doing anything else in this batch")
|
|
||||||
}
|
|
||||||
Replacement::Unexpected(other) => {
|
|
||||||
error!(
|
|
||||||
local_layer.ptr=?Arc::as_ptr(local_layer),
|
|
||||||
other.ptr=?Arc::as_ptr(&other),
|
|
||||||
?other,
|
|
||||||
"failed to replace");
|
|
||||||
false
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(replaced)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1153,13 +956,6 @@ impl Timeline {
|
|||||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
|
||||||
tenant_conf
|
|
||||||
.eviction_policy
|
|
||||||
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Open a Timeline handle.
|
/// Open a Timeline handle.
|
||||||
///
|
///
|
||||||
/// Loads the metadata for the timeline into memory, but not the layer map.
|
/// Loads the metadata for the timeline into memory, but not the layer map.
|
||||||
@@ -1716,31 +1512,13 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
x @ Err(_) => x.context("Failed to calculate logical size")?,
|
x @ Err(_) => x.context("Failed to calculate logical size")?,
|
||||||
};
|
};
|
||||||
|
|
||||||
// we cannot query current_logical_size.current_size() to know the current
|
|
||||||
// *negative* value, only truncated to u64.
|
|
||||||
let added = self_clone
|
|
||||||
.current_logical_size
|
|
||||||
.size_added_after_initial
|
|
||||||
.load(AtomicOrdering::Relaxed);
|
|
||||||
|
|
||||||
let sum = calculated_size.saturating_add_signed(added);
|
|
||||||
|
|
||||||
// set the gauge value before it can be set in `update_current_logical_size`.
|
|
||||||
self_clone.metrics.current_logical_size_gauge.set(sum);
|
|
||||||
|
|
||||||
match self_clone
|
match self_clone
|
||||||
.current_logical_size
|
.current_logical_size
|
||||||
.initial_logical_size
|
.initial_logical_size
|
||||||
.set(calculated_size)
|
.set(calculated_size)
|
||||||
{
|
{
|
||||||
Ok(()) => (),
|
Ok(()) => (),
|
||||||
Err(_what_we_just_attempted_to_set) => {
|
Err(existing_size) => {
|
||||||
let existing_size = self_clone
|
|
||||||
.current_logical_size
|
|
||||||
.initial_logical_size
|
|
||||||
.get()
|
|
||||||
.expect("once_cell set was lost, then get failed, impossible.");
|
|
||||||
// This shouldn't happen because the semaphore is initialized with 1.
|
// This shouldn't happen because the semaphore is initialized with 1.
|
||||||
// But if it happens, just complain & report success so there are no further retries.
|
// But if it happens, just complain & report success so there are no further retries.
|
||||||
error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
|
error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
|
||||||
@@ -1798,9 +1576,15 @@ impl Timeline {
|
|||||||
let calculation = async {
|
let calculation = async {
|
||||||
let cancel = cancel.child_token();
|
let cancel = cancel.child_token();
|
||||||
let ctx = ctx.attached_child();
|
let ctx = ctx.attached_child();
|
||||||
self_calculation
|
tokio::task::spawn_blocking(move || {
|
||||||
.calculate_logical_size(init_lsn, cancel, &ctx)
|
// Run in a separate thread since this can do a lot of
|
||||||
.await
|
// synchronous file IO without .await inbetween
|
||||||
|
// if there are no RemoteLayers that would require downloading.
|
||||||
|
let h = tokio::runtime::Handle::current();
|
||||||
|
h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx))
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.context("Failed to spawn calculation result task")?
|
||||||
};
|
};
|
||||||
let timeline_state_cancellation = async {
|
let timeline_state_cancellation = async {
|
||||||
loop {
|
loop {
|
||||||
@@ -1833,7 +1617,7 @@ impl Timeline {
|
|||||||
tokio::pin!(calculation);
|
tokio::pin!(calculation);
|
||||||
loop {
|
loop {
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
res = &mut calculation => { return res }
|
res = &mut calculation => { return res }
|
||||||
reason = timeline_state_cancellation => {
|
reason = timeline_state_cancellation => {
|
||||||
debug!(reason = reason, "cancelling calculation");
|
debug!(reason = reason, "cancelling calculation");
|
||||||
cancel.cancel();
|
cancel.cancel();
|
||||||
@@ -1917,15 +1701,10 @@ impl Timeline {
|
|||||||
// one value while current_logical_size is set to the
|
// one value while current_logical_size is set to the
|
||||||
// other.
|
// other.
|
||||||
match logical_size.current_size() {
|
match logical_size.current_size() {
|
||||||
Ok(CurrentLogicalSize::Exact(new_current_size)) => self
|
Ok(new_current_size) => self
|
||||||
.metrics
|
.metrics
|
||||||
.current_logical_size_gauge
|
.current_logical_size_gauge
|
||||||
.set(new_current_size),
|
.set(new_current_size.size()),
|
||||||
Ok(CurrentLogicalSize::Approximate(_)) => {
|
|
||||||
// don't update the gauge yet, this allows us not to update the gauge back and
|
|
||||||
// forth between the initial size calculation task.
|
|
||||||
}
|
|
||||||
// this is overflow
|
|
||||||
Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
|
Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1950,14 +1729,11 @@ impl Timeline {
|
|||||||
layer: Arc<dyn PersistentLayer>,
|
layer: Arc<dyn PersistentLayer>,
|
||||||
updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
|
updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
if !layer.is_remote_layer() {
|
let layer_size = layer.file_size();
|
||||||
layer.delete_resident_layer_file()?;
|
|
||||||
let layer_file_size = layer
|
layer.delete()?;
|
||||||
.file_size()
|
if let Some(layer_size) = layer_size {
|
||||||
.expect("Local layer should have a file size");
|
self.metrics.resident_physical_size_gauge.sub(layer_size);
|
||||||
self.metrics
|
|
||||||
.resident_physical_size_gauge
|
|
||||||
.sub(layer_file_size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO Removing from the bottom of the layer map is expensive.
|
// TODO Removing from the bottom of the layer map is expensive.
|
||||||
@@ -2497,7 +2273,7 @@ impl Timeline {
|
|||||||
// Only one thread may call this function at a time (for this
|
// Only one thread may call this function at a time (for this
|
||||||
// timeline). If two threads tried to flush the same frozen
|
// timeline). If two threads tried to flush the same frozen
|
||||||
// layer to disk at the same time, that would not work.
|
// layer to disk at the same time, that would not work.
|
||||||
assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer));
|
assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer));
|
||||||
|
|
||||||
// release lock on 'layers'
|
// release lock on 'layers'
|
||||||
}
|
}
|
||||||
@@ -2633,13 +2409,10 @@ impl Timeline {
|
|||||||
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||||
{
|
{
|
||||||
let partitioning_guard = self.partitioning.lock().unwrap();
|
let partitioning_guard = self.partitioning.lock().unwrap();
|
||||||
let distance = lsn.0 - partitioning_guard.1 .0;
|
if partitioning_guard.1 != Lsn(0)
|
||||||
if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
|
&& lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
|
||||||
debug!(
|
{
|
||||||
distance,
|
// no repartitioning needed
|
||||||
threshold = self.repartition_threshold,
|
|
||||||
"no repartitioning needed"
|
|
||||||
);
|
|
||||||
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
|
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2657,12 +2430,8 @@ impl Timeline {
|
|||||||
|
|
||||||
// Is it time to create a new image layer for the given partition?
|
// Is it time to create a new image layer for the given partition?
|
||||||
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
|
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
|
||||||
let threshold = self.get_image_creation_threshold();
|
|
||||||
|
|
||||||
let layers = self.layers.read().unwrap();
|
let layers = self.layers.read().unwrap();
|
||||||
|
|
||||||
let mut max_deltas = 0;
|
|
||||||
|
|
||||||
for part_range in &partition.ranges {
|
for part_range in &partition.ranges {
|
||||||
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
||||||
for (img_range, last_img) in image_coverage {
|
for (img_range, last_img) in image_coverage {
|
||||||
@@ -2684,25 +2453,21 @@ impl Timeline {
|
|||||||
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
||||||
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||||
if img_lsn < lsn {
|
if img_lsn < lsn {
|
||||||
|
let threshold = self.get_image_creation_threshold();
|
||||||
let num_deltas =
|
let num_deltas =
|
||||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
|
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
|
||||||
|
|
||||||
max_deltas = max_deltas.max(num_deltas);
|
debug!(
|
||||||
|
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||||
|
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||||
|
);
|
||||||
if num_deltas >= threshold {
|
if num_deltas >= threshold {
|
||||||
debug!(
|
|
||||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
|
||||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
|
||||||
);
|
|
||||||
return Ok(true);
|
return Ok(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!(
|
|
||||||
max_deltas,
|
|
||||||
"none of the partitioned ranges had >= {threshold} deltas"
|
|
||||||
);
|
|
||||||
Ok(false)
|
Ok(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2815,55 +2580,25 @@ impl Timeline {
|
|||||||
Ok(layer_paths_to_upload)
|
Ok(layer_paths_to_upload)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct CompactLevel0Phase1Result {
|
struct CompactLevel0Phase1Result {
|
||||||
new_layers: Vec<DeltaLayer>,
|
new_layers: Vec<DeltaLayer>,
|
||||||
deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
|
deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Top-level failure to compact.
|
|
||||||
#[derive(Debug)]
|
|
||||||
enum CompactionError {
|
|
||||||
/// L0 compaction requires layers to be downloaded.
|
|
||||||
///
|
|
||||||
/// This should not happen repeatedly, but will be retried once by top-level
|
|
||||||
/// `Timeline::compact`.
|
|
||||||
DownloadRequired(Vec<Arc<RemoteLayer>>),
|
|
||||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
|
||||||
Other(anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<anyhow::Error> for CompactionError {
|
|
||||||
fn from(value: anyhow::Error) -> Self {
|
|
||||||
CompactionError::Other(value)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Timeline {
|
impl Timeline {
|
||||||
/// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
|
|
||||||
///
|
|
||||||
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
|
||||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
|
||||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
|
||||||
async fn compact_level0_phase1(
|
async fn compact_level0_phase1(
|
||||||
&self,
|
&self,
|
||||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
|
||||||
target_file_size: u64,
|
target_file_size: u64,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
) -> anyhow::Result<CompactLevel0Phase1Result> {
|
||||||
let layers = self.layers.read().unwrap();
|
let layers = self.layers.read().unwrap();
|
||||||
let mut level0_deltas = layers.get_level0_deltas()?;
|
let mut level0_deltas = layers.get_level0_deltas()?;
|
||||||
drop(layers);
|
drop(layers);
|
||||||
|
|
||||||
// Only compact if enough layers have accumulated.
|
// Only compact if enough layers have accumulated.
|
||||||
let threshold = self.get_compaction_threshold();
|
if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
|
||||||
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
|
return Ok(Default::default());
|
||||||
debug!(
|
|
||||||
level0_deltas = level0_deltas.len(),
|
|
||||||
threshold, "too few deltas to compact"
|
|
||||||
);
|
|
||||||
return Ok(CompactLevel0Phase1Result::default());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Gather the files to compact in this iteration.
|
// Gather the files to compact in this iteration.
|
||||||
@@ -2899,24 +2634,6 @@ impl Timeline {
|
|||||||
end: deltas_to_compact.last().unwrap().get_lsn_range().end,
|
end: deltas_to_compact.last().unwrap().get_lsn_range().end,
|
||||||
};
|
};
|
||||||
|
|
||||||
let remotes = deltas_to_compact
|
|
||||||
.iter()
|
|
||||||
.filter(|l| l.is_remote_layer())
|
|
||||||
.inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
|
|
||||||
.map(|l| {
|
|
||||||
l.clone()
|
|
||||||
.downcast_remote_layer()
|
|
||||||
.expect("just checked it is remote layer")
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
if !remotes.is_empty() {
|
|
||||||
// caller is holding the lock to layer_removal_cs, and we don't want to download while
|
|
||||||
// holding that; in future download_remote_layer might take it as well. this is
|
|
||||||
// regardless of earlier image creation downloading on-demand, while holding the lock.
|
|
||||||
return Err(CompactionError::DownloadRequired(remotes));
|
|
||||||
}
|
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
|
"Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
|
||||||
lsn_range.start,
|
lsn_range.start,
|
||||||
@@ -2924,11 +2641,9 @@ impl Timeline {
|
|||||||
deltas_to_compact.len(),
|
deltas_to_compact.len(),
|
||||||
level0_deltas.len()
|
level0_deltas.len()
|
||||||
);
|
);
|
||||||
|
|
||||||
for l in deltas_to_compact.iter() {
|
for l in deltas_to_compact.iter() {
|
||||||
info!("compact includes {}", l.filename().file_name());
|
info!("compact includes {}", l.filename().file_name());
|
||||||
}
|
}
|
||||||
|
|
||||||
// We don't need the original list of layers anymore. Drop it so that
|
// We don't need the original list of layers anymore. Drop it so that
|
||||||
// we don't accidentally use it later in the function.
|
// we don't accidentally use it later in the function.
|
||||||
drop(level0_deltas);
|
drop(level0_deltas);
|
||||||
@@ -2972,47 +2687,6 @@ impl Timeline {
|
|||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// Determine N largest holes where N is number of compacted layers.
|
|
||||||
let max_holes = deltas_to_compact.len();
|
|
||||||
let last_record_lsn = self.get_last_record_lsn();
|
|
||||||
let layers = self.layers.read().unwrap(); // Is'n it better to hold original layers lock till here?
|
|
||||||
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
|
|
||||||
let min_hole_coverage_size = 3; // TODO: something more flexible?
|
|
||||||
|
|
||||||
// min-heap (reserve space for one more element added before eviction)
|
|
||||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
|
||||||
let mut prev: Option<Key> = None;
|
|
||||||
for (next_key, _next_lsn, _size) in itertools::process_results(
|
|
||||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
|
||||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
|
|
||||||
)? {
|
|
||||||
if let Some(prev_key) = prev {
|
|
||||||
// just first fast filter
|
|
||||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
|
|
||||||
let key_range = prev_key..next_key;
|
|
||||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
|
||||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
|
||||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
|
||||||
// That is why it is better to measure size of hole as number of covering image layers.
|
|
||||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
|
|
||||||
if coverage_size >= min_hole_coverage_size {
|
|
||||||
heap.push(Hole {
|
|
||||||
key_range,
|
|
||||||
coverage_size,
|
|
||||||
});
|
|
||||||
if heap.len() > max_holes {
|
|
||||||
heap.pop(); // remove smallest hole
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prev = Some(next_key.next());
|
|
||||||
}
|
|
||||||
drop(layers);
|
|
||||||
let mut holes = heap.into_vec();
|
|
||||||
holes.sort_unstable_by_key(|hole| hole.key_range.start);
|
|
||||||
let mut next_hole = 0; // index of next hole in holes vector
|
|
||||||
|
|
||||||
// Merge the contents of all the input delta layers into a new set
|
// Merge the contents of all the input delta layers into a new set
|
||||||
// of delta layers, based on the current partitioning.
|
// of delta layers, based on the current partitioning.
|
||||||
//
|
//
|
||||||
@@ -3107,22 +2781,14 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
if writer.is_some() {
|
if writer.is_some() {
|
||||||
let written_size = writer.as_mut().unwrap().size();
|
let written_size = writer.as_mut().unwrap().size();
|
||||||
let contains_hole =
|
// check if key cause layer overflow...
|
||||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
|
||||||
// check if key cause layer overflow or contains hole...
|
|
||||||
if is_dup_layer
|
if is_dup_layer
|
||||||
|| dup_end_lsn.is_valid()
|
|| dup_end_lsn.is_valid()
|
||||||
|| written_size + key_values_total_size > target_file_size
|
|| written_size + key_values_total_size > target_file_size
|
||||||
|| contains_hole
|
|
||||||
{
|
{
|
||||||
// ... if so, flush previous layer and prepare to write new one
|
// ... if so, flush previous layer and prepare to write new one
|
||||||
new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
|
new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
|
||||||
writer = None;
|
writer = None;
|
||||||
|
|
||||||
if contains_hole {
|
|
||||||
// skip hole
|
|
||||||
next_hole += 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Remember size of key value because at next iteration we will access next item
|
// Remember size of key value because at next iteration we will access next item
|
||||||
@@ -3147,9 +2813,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||||
return Err(
|
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
|
||||||
anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(),
|
|
||||||
);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||||
@@ -3168,7 +2832,7 @@ impl Timeline {
|
|||||||
|
|
||||||
// Fsync all the layer files and directory using multiple threads to
|
// Fsync all the layer files and directory using multiple threads to
|
||||||
// minimize latency.
|
// minimize latency.
|
||||||
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
par_fsync::par_fsync(&layer_paths)?;
|
||||||
|
|
||||||
layer_paths.pop().unwrap();
|
layer_paths.pop().unwrap();
|
||||||
}
|
}
|
||||||
@@ -3190,13 +2854,11 @@ impl Timeline {
|
|||||||
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||||
target_file_size: u64,
|
target_file_size: u64,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), CompactionError> {
|
) -> anyhow::Result<()> {
|
||||||
let CompactLevel0Phase1Result {
|
let CompactLevel0Phase1Result {
|
||||||
new_layers,
|
new_layers,
|
||||||
deltas_to_compact,
|
deltas_to_compact,
|
||||||
} = self
|
} = self.compact_level0_phase1(target_file_size, ctx).await?;
|
||||||
.compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||||
// nothing to do
|
// nothing to do
|
||||||
@@ -3220,12 +2882,7 @@ impl Timeline {
|
|||||||
for l in new_layers {
|
for l in new_layers {
|
||||||
let new_delta_path = l.path();
|
let new_delta_path = l.path();
|
||||||
|
|
||||||
let metadata = new_delta_path.metadata().with_context(|| {
|
let metadata = new_delta_path.metadata()?;
|
||||||
format!(
|
|
||||||
"read file metadata for new created layer {}",
|
|
||||||
new_delta_path.display()
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if let Some(remote_client) = &self.remote_client {
|
if let Some(remote_client) = &self.remote_client {
|
||||||
remote_client.schedule_layer_file_upload(
|
remote_client.schedule_layer_file_upload(
|
||||||
@@ -3459,7 +3116,7 @@ impl Timeline {
|
|||||||
|
|
||||||
let mut layers_to_remove = Vec::new();
|
let mut layers_to_remove = Vec::new();
|
||||||
|
|
||||||
// Scan all layers in the timeline (remote or on-disk).
|
// Scan all on-disk layers in the timeline.
|
||||||
//
|
//
|
||||||
// Garbage collect the layer if all conditions are satisfied:
|
// Garbage collect the layer if all conditions are satisfied:
|
||||||
// 1. it is older than cutoff LSN;
|
// 1. it is older than cutoff LSN;
|
||||||
@@ -3698,26 +3355,14 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
remote_layer: Arc<RemoteLayer>,
|
remote_layer: Arc<RemoteLayer>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
use std::sync::atomic::Ordering::Relaxed;
|
|
||||||
|
|
||||||
let permit = match Arc::clone(&remote_layer.ongoing_download)
|
let permit = match Arc::clone(&remote_layer.ongoing_download)
|
||||||
.acquire_owned()
|
.acquire_owned()
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(permit) => permit,
|
Ok(permit) => permit,
|
||||||
Err(_closed) => {
|
Err(_closed) => {
|
||||||
if remote_layer.download_replacement_failure.load(Relaxed) {
|
info!("download of layer has already finished");
|
||||||
// this path will be hit often, in case there are upper retries. however
|
return Ok(());
|
||||||
// hitting this error will prevent a busy loop between get_reconstruct_data and
|
|
||||||
// download, so an error is prefered.
|
|
||||||
//
|
|
||||||
// TODO: we really should poison the timeline, but panicking is not yet
|
|
||||||
// supported. Related: https://github.com/neondatabase/neon/issues/3621
|
|
||||||
anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
|
|
||||||
} else {
|
|
||||||
info!("download of layer has already finished");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -3749,12 +3394,11 @@ impl Timeline {
|
|||||||
// Delta- or ImageLayer in the layer map.
|
// Delta- or ImageLayer in the layer map.
|
||||||
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size);
|
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size);
|
||||||
let mut layers = self_clone.layers.write().unwrap();
|
let mut layers = self_clone.layers.write().unwrap();
|
||||||
let mut updates = layers.batch_update();
|
|
||||||
{
|
{
|
||||||
use crate::tenant::layer_map::Replacement;
|
use crate::tenant::layer_map::Replacement;
|
||||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||||
let failure = match updates.replace_historic(&l, new_layer) {
|
match layers.replace_historic(&l, new_layer) {
|
||||||
Ok(Replacement::Replaced { .. }) => false,
|
Ok(Replacement::Replaced { .. }) => { /* expected */ }
|
||||||
Ok(Replacement::NotFound) => {
|
Ok(Replacement::NotFound) => {
|
||||||
// TODO: the downloaded file should probably be removed, otherwise
|
// TODO: the downloaded file should probably be removed, otherwise
|
||||||
// it will be added to the layermap on next load? we should
|
// it will be added to the layermap on next load? we should
|
||||||
@@ -3762,7 +3406,6 @@ impl Timeline {
|
|||||||
//
|
//
|
||||||
// See: https://github.com/neondatabase/neon/issues/3533
|
// See: https://github.com/neondatabase/neon/issues/3533
|
||||||
error!("replacing downloaded layer into layermap failed because layer was not found");
|
error!("replacing downloaded layer into layermap failed because layer was not found");
|
||||||
true
|
|
||||||
}
|
}
|
||||||
Ok(Replacement::RemovalBuffered) => {
|
Ok(Replacement::RemovalBuffered) => {
|
||||||
unreachable!("current implementation does not remove anything")
|
unreachable!("current implementation does not remove anything")
|
||||||
@@ -3778,38 +3421,16 @@ impl Timeline {
|
|||||||
error!(
|
error!(
|
||||||
expected.ptr = ?Arc::as_ptr(&l),
|
expected.ptr = ?Arc::as_ptr(&l),
|
||||||
other.ptr = ?Arc::as_ptr(&other),
|
other.ptr = ?Arc::as_ptr(&other),
|
||||||
?other,
|
|
||||||
"replacing downloaded layer into layermap failed because another layer was found instead of expected"
|
"replacing downloaded layer into layermap failed because another layer was found instead of expected"
|
||||||
);
|
);
|
||||||
true
|
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// this is a precondition failure, the layer filename derived
|
// this is a precondition failure, the layer filename derived
|
||||||
// attributes didn't match up, which doesn't seem likely.
|
// attributes didn't match up, which doesn't seem likely.
|
||||||
error!("replacing downloaded layer into layermap failed: {e:#?}");
|
error!("replacing downloaded layer into layermap failed: {e:#?}")
|
||||||
true
|
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
if failure {
|
|
||||||
// mark the remote layer permanently failed; the timeline is most
|
|
||||||
// likely unusable after this. sadly we cannot just poison the layermap
|
|
||||||
// lock with panic, because that would create an issue with shutdown.
|
|
||||||
//
|
|
||||||
// this does not change the retry semantics on failed downloads.
|
|
||||||
//
|
|
||||||
// use of Relaxed is valid because closing of the semaphore gives
|
|
||||||
// happens-before and wakes up any waiters; we write this value before
|
|
||||||
// and any waiters (or would be waiters) will load it after closing
|
|
||||||
// semaphore.
|
|
||||||
//
|
|
||||||
// See: https://github.com/neondatabase/neon/issues/3533
|
|
||||||
remote_layer
|
|
||||||
.download_replacement_failure
|
|
||||||
.store(true, Relaxed);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
updates.flush();
|
|
||||||
drop(layers);
|
drop(layers);
|
||||||
|
|
||||||
// Now that we've inserted the download into the layer map,
|
// Now that we've inserted the download into the layer map,
|
||||||
@@ -3819,7 +3440,6 @@ impl Timeline {
|
|||||||
remote_layer.ongoing_download.close();
|
remote_layer.ongoing_download.close();
|
||||||
} else {
|
} else {
|
||||||
// Keep semaphore open. We'll drop the permit at the end of the function.
|
// Keep semaphore open. We'll drop the permit at the end of the function.
|
||||||
info!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Don't treat it as an error if the task that triggered the download
|
// Don't treat it as an error if the task that triggered the download
|
||||||
@@ -3833,7 +3453,7 @@ impl Timeline {
|
|||||||
drop(permit);
|
drop(permit);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}.in_current_span(),
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
receiver.await.context("download task cancelled")?
|
receiver.await.context("download task cancelled")?
|
||||||
|
|||||||
@@ -1,219 +0,0 @@
|
|||||||
//! The per-timeline layer eviction task.
|
|
||||||
|
|
||||||
use std::{
|
|
||||||
ops::ControlFlow,
|
|
||||||
sync::Arc,
|
|
||||||
time::{Duration, SystemTime},
|
|
||||||
};
|
|
||||||
|
|
||||||
use either::Either;
|
|
||||||
use tokio::time::Instant;
|
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use tracing::{debug, error, info, instrument, warn};
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
|
||||||
tenant::{
|
|
||||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
|
||||||
storage_layer::PersistentLayer,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
use super::Timeline;
|
|
||||||
|
|
||||||
impl Timeline {
|
|
||||||
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
|
|
||||||
let self_clone = Arc::clone(self);
|
|
||||||
task_mgr::spawn(
|
|
||||||
BACKGROUND_RUNTIME.handle(),
|
|
||||||
TaskKind::Eviction,
|
|
||||||
Some(self.tenant_id),
|
|
||||||
Some(self.timeline_id),
|
|
||||||
&format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
|
|
||||||
false,
|
|
||||||
async move {
|
|
||||||
self_clone.eviction_task(task_mgr::shutdown_token()).await;
|
|
||||||
info!("eviction task finishing");
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
|
||||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
|
||||||
use crate::tenant::tasks::random_init_delay;
|
|
||||||
{
|
|
||||||
let policy = self.get_eviction_policy();
|
|
||||||
let period = match policy {
|
|
||||||
EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
|
|
||||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
|
||||||
};
|
|
||||||
if random_init_delay(period, &cancel).await.is_err() {
|
|
||||||
info!("shutting down");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let policy = self.get_eviction_policy();
|
|
||||||
let cf = self.eviction_iteration(&policy, cancel.clone()).await;
|
|
||||||
|
|
||||||
match cf {
|
|
||||||
ControlFlow::Break(()) => break,
|
|
||||||
ControlFlow::Continue(sleep_until) => {
|
|
||||||
tokio::select! {
|
|
||||||
_ = cancel.cancelled() => {
|
|
||||||
info!("shutting down");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
_ = tokio::time::sleep_until(sleep_until) => { }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
|
|
||||||
async fn eviction_iteration(
|
|
||||||
self: &Arc<Self>,
|
|
||||||
policy: &EvictionPolicy,
|
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> ControlFlow<(), Instant> {
|
|
||||||
debug!("eviction iteration: {policy:?}");
|
|
||||||
match policy {
|
|
||||||
EvictionPolicy::NoEviction => {
|
|
||||||
// check again in 10 seconds; XXX config watch mechanism
|
|
||||||
ControlFlow::Continue(Instant::now() + Duration::from_secs(10))
|
|
||||||
}
|
|
||||||
EvictionPolicy::LayerAccessThreshold(p) => {
|
|
||||||
let start = Instant::now();
|
|
||||||
match self.eviction_iteration_threshold(p, cancel).await {
|
|
||||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
|
||||||
ControlFlow::Continue(()) => (),
|
|
||||||
}
|
|
||||||
let elapsed = start.elapsed();
|
|
||||||
crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
|
|
||||||
ControlFlow::Continue(start + p.period)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn eviction_iteration_threshold(
|
|
||||||
self: &Arc<Self>,
|
|
||||||
p: &EvictionPolicyLayerAccessThreshold,
|
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> ControlFlow<()> {
|
|
||||||
let now = SystemTime::now();
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
#[derive(Debug, Default)]
|
|
||||||
struct EvictionStats {
|
|
||||||
candidates: usize,
|
|
||||||
evicted: usize,
|
|
||||||
errors: usize,
|
|
||||||
not_evictable: usize,
|
|
||||||
skipped_for_shutdown: usize,
|
|
||||||
}
|
|
||||||
let mut stats = EvictionStats::default();
|
|
||||||
// Gather layers for eviction.
|
|
||||||
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
|
||||||
// We don't want to hold the layer map lock during eviction.
|
|
||||||
// So, we just need to deal with this.
|
|
||||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
|
||||||
let layers = self.layers.read().unwrap();
|
|
||||||
let mut candidates = Vec::new();
|
|
||||||
for hist_layer in layers.iter_historic_layers() {
|
|
||||||
if hist_layer.is_remote_layer() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let last_activity_ts = match hist_layer
|
|
||||||
.access_stats()
|
|
||||||
.most_recent_access_or_residence_event()
|
|
||||||
{
|
|
||||||
Either::Left(mra) => mra.when,
|
|
||||||
Either::Right(re) => re.timestamp,
|
|
||||||
};
|
|
||||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
|
||||||
Ok(d) => d,
|
|
||||||
Err(_e) => {
|
|
||||||
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
|
||||||
// happen if there is an access between us getting `now`, and us getting
|
|
||||||
// the access stats from the layer.
|
|
||||||
//
|
|
||||||
// The other reason why it can happen is system clock skew because
|
|
||||||
// SystemTime::now() is not monotonic, so, even if there is no access
|
|
||||||
// to the layer after we get `now` at the beginning of this function,
|
|
||||||
// it could be that `now` < `last_activity_ts`.
|
|
||||||
//
|
|
||||||
// To distinguish the cases, we would need to record `Instant`s in the
|
|
||||||
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
|
||||||
// values in the access stats would need to be `Instant`'s, and hence
|
|
||||||
// they would be meaningless outside of the pageserver process.
|
|
||||||
// At the time of writing, the trade-off is that access stats are more
|
|
||||||
// valuable than detecting clock skew.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if no_activity_for > p.threshold {
|
|
||||||
candidates.push(hist_layer)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
candidates
|
|
||||||
};
|
|
||||||
stats.candidates = candidates.len();
|
|
||||||
|
|
||||||
let remote_client = match self.remote_client.as_ref() {
|
|
||||||
None => {
|
|
||||||
error!(
|
|
||||||
num_candidates = candidates.len(),
|
|
||||||
"no remote storage configured, cannot evict layers"
|
|
||||||
);
|
|
||||||
return ControlFlow::Continue(());
|
|
||||||
}
|
|
||||||
Some(c) => c,
|
|
||||||
};
|
|
||||||
|
|
||||||
let results = match self
|
|
||||||
.evict_layer_batch(remote_client, &candidates[..], cancel)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Err(pre_err) => {
|
|
||||||
stats.errors += candidates.len();
|
|
||||||
error!("could not do any evictions: {pre_err:#}");
|
|
||||||
return ControlFlow::Continue(());
|
|
||||||
}
|
|
||||||
Ok(results) => results,
|
|
||||||
};
|
|
||||||
assert_eq!(results.len(), candidates.len());
|
|
||||||
for (l, result) in candidates.iter().zip(results) {
|
|
||||||
match result {
|
|
||||||
None => {
|
|
||||||
stats.skipped_for_shutdown += 1;
|
|
||||||
}
|
|
||||||
Some(Ok(true)) => {
|
|
||||||
debug!("evicted layer {l:?}");
|
|
||||||
stats.evicted += 1;
|
|
||||||
}
|
|
||||||
Some(Ok(false)) => {
|
|
||||||
debug!("layer is not evictable: {l:?}");
|
|
||||||
stats.not_evictable += 1;
|
|
||||||
}
|
|
||||||
Some(Err(e)) => {
|
|
||||||
// This variant is the case where an unexpected error happened during eviction.
|
|
||||||
// Expected errors that result in non-eviction are `Some(Ok(false))`.
|
|
||||||
// So, dump Debug here to gather as much info as possible in this rare case.
|
|
||||||
warn!("failed to evict layer {l:?}: {e:?}");
|
|
||||||
stats.errors += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if stats.candidates == stats.not_evictable {
|
|
||||||
debug!(stats=?stats, "eviction iteration complete");
|
|
||||||
} else if stats.errors > 0 || stats.not_evictable > 0 {
|
|
||||||
warn!(stats=?stats, "eviction iteration complete");
|
|
||||||
} else {
|
|
||||||
info!(stats=?stats, "eviction iteration complete");
|
|
||||||
}
|
|
||||||
ControlFlow::Continue(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user