mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
Merge pull request #3656 from neondatabase/releases/2023-02-21
Release 2023-02-21
This commit is contained in:
@@ -21,3 +21,4 @@
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
|
||||
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-ap-southeast-1
|
||||
bucket_region: ap-southeast-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
|
||||
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
4
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-eu-central-1
|
||||
bucket_region: eu-central-1
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
|
||||
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
|
||||
4
.github/ansible/prod.us-west-2.hosts.yaml
vendored
4
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -2,11 +2,11 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-prod-storage-us-west-2
|
||||
bucket_region: us-west-2
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
|
||||
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
|
||||
40
.github/ansible/production.hosts.yaml
vendored
40
.github/ansible/production.hosts.yaml
vendored
@@ -1,40 +0,0 @@
|
||||
---
|
||||
storage:
|
||||
vars:
|
||||
console_mgmt_base_url: http://console-release.local
|
||||
bucket_name: zenith-storage-oregon
|
||||
bucket_region: us-west-2
|
||||
broker_endpoint: http://storage-broker.prod.local:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
prefix_in_bucket: "{{ inventory_hostname }}"
|
||||
safekeeper_s3_prefix: prod-1/wal
|
||||
hostname_suffix: ".local"
|
||||
remote_user: admin
|
||||
sentry_environment: production
|
||||
|
||||
children:
|
||||
pageservers:
|
||||
hosts:
|
||||
zenith-1-ps-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-3:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-4:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-5:
|
||||
console_region_id: aws-us-west-2
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
zenith-1-sk-1:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-2:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-sk-4:
|
||||
console_region_id: aws-us-west-2
|
||||
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
9
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -2,12 +2,17 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-dev-storage-eu-west-1
|
||||
bucket_region: eu-west-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
9
.github/ansible/staging.us-east-2.hosts.yaml
vendored
9
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -2,12 +2,17 @@ storage:
|
||||
vars:
|
||||
bucket_name: neon-staging-storage-us-east-2
|
||||
bucket_region: us-east-2
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
console_mgmt_base_url: http://neon-internal-api.aws.neon.build
|
||||
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.eu-west-1.aws.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -10,7 +10,7 @@ settings:
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
domain: "pg.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy-link pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.cloud.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -6,11 +6,11 @@ image:
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
# Use L4 LB
|
||||
service:
|
||||
# service.annotations -- Annotations to add to the service
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
|
||||
# assign service to this name at external-dns
|
||||
external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
|
||||
# service.type -- Service type
|
||||
type: LoadBalancer
|
||||
# service.port -- broker listen port
|
||||
port: 50051
|
||||
|
||||
ingress:
|
||||
enabled: false
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
settings:
|
||||
sentryEnvironment: "production"
|
||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -611,34 +611,31 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_INFORMANT_VERSION: 0.1.1
|
||||
VM_BUILDER_VERSION: v0.4.6
|
||||
|
||||
steps:
|
||||
- name: Downloading latest vm-builder
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Downloading vm-builder
|
||||
run: |
|
||||
curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
|
||||
curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
- name: Pulling compute-node image
|
||||
run: |
|
||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
|
||||
- name: Building VM compute-node rootfs
|
||||
run: |
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
|
||||
chmod +x vm-informant
|
||||
|
||||
- name: Adding VM informant to compute-node image
|
||||
run: |
|
||||
ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
|
||||
docker cp vm-informant $ID:/bin/vm-informant
|
||||
docker commit $ID temp-vm-compute-node
|
||||
docker rm -f $ID
|
||||
docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
|
||||
./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
|
||||
77
.github/workflows/deploy-prod.yml
vendored
77
.github/workflows/deploy-prod.yml
vendored
@@ -165,80 +165,3 @@ jobs:
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
# Deploy to old account below
|
||||
|
||||
deploy:
|
||||
runs-on: prod
|
||||
container:
|
||||
image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
options: --user root --privileged
|
||||
if: inputs.deployStorage && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
environment:
|
||||
name: prod-old
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
ref: ${{ inputs.branch }}
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
export DOCKER_TAG=${{ inputs.dockerTag }}
|
||||
cd "$(pwd)/.github/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
eval $(ssh-agent)
|
||||
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
|
||||
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
|
||||
- name: Cleanup ansible folder
|
||||
run: rm -rf ~/.ansible
|
||||
|
||||
deploy-storage-broker:
|
||||
name: deploy storage broker on old staging and old prod
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
environment:
|
||||
name: prod-old
|
||||
env:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
ref: ${{ inputs.branch }}
|
||||
|
||||
- name: Store kubeconfig file
|
||||
run: |
|
||||
echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
|
||||
- name: Add neon helm chart
|
||||
run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
|
||||
- name: Deploy storage-broker
|
||||
run:
|
||||
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
|
||||
|
||||
- name: Cleanup helm folder
|
||||
run: rm -rf ~/.cache
|
||||
|
||||
66
Cargo.lock
generated
66
Cargo.lock
generated
@@ -2110,6 +2110,16 @@ version = "0.3.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "mime_guess"
|
||||
version = "2.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
|
||||
dependencies = [
|
||||
"mime",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
@@ -2884,6 +2894,7 @@ dependencies = [
|
||||
"md5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"pq_proto",
|
||||
@@ -2892,6 +2903,8 @@ dependencies = [
|
||||
"rcgen",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"reqwest-tracing",
|
||||
"routerify",
|
||||
"rstest",
|
||||
"rustls",
|
||||
@@ -2909,7 +2922,9 @@ dependencies = [
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
@@ -3079,6 +3094,7 @@ dependencies = [
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"mime_guess",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
@@ -3098,6 +3114,36 @@ dependencies = [
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-middleware"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"http",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"task-local-extensions",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.16.20"
|
||||
@@ -3790,6 +3836,15 @@ dependencies = [
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "task-local-extensions"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4"
|
||||
dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.3.0"
|
||||
@@ -3809,6 +3864,8 @@ name = "tenant_size_model"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -4358,6 +4415,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
|
||||
dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.10"
|
||||
|
||||
@@ -76,6 +76,8 @@ prost = "0.11"
|
||||
rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
routerify = "3"
|
||||
rpds = "0.12.0"
|
||||
rustls = "0.20"
|
||||
|
||||
@@ -11,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev
|
||||
libicu-dev libxslt1-dev
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -23,7 +23,8 @@ FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION} postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
|
||||
--with-libxml --with-libxslt && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
@@ -34,7 +35,8 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -50,15 +52,15 @@ RUN apt update && \
|
||||
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
|
||||
protobuf-c-compiler xsltproc
|
||||
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
|
||||
tar zxvf SFCGAL-v1.3.10.tar.gz && \
|
||||
cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make clean && cp -R /sfcgal/* /
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
tar xvzf postgis-3.3.1.tar.gz && \
|
||||
cd postgis-3.3.1 && \
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
./autogen.sh && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
@@ -83,29 +85,14 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
# https://github.com/plv8/plv8/issues/475:
|
||||
# v8 uses gold for linking and sets `--thread-count=4` which breaks
|
||||
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
|
||||
# Install newer gold version manually as debian-testing binutils version updates
|
||||
# libc version, which in turn breaks other extension built against non-testing libc.
|
||||
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
|
||||
tar xvzf binutils-2.38.tar.gz && \
|
||||
cd binutils-2.38 && \
|
||||
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
cd ../bfd && ./configure && make bfdver.h && \
|
||||
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
|
||||
cp /usr/local/bin/ld.gold /usr/bin/gold
|
||||
|
||||
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
tar xvzf v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
@@ -126,20 +113,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
|
||||
tar xvzf h3.tgz && \
|
||||
cd h3-4.0.1 && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
|
||||
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/h3 make install && \
|
||||
cp -R /h3/usr / && \
|
||||
rm -rf build
|
||||
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
|
||||
tar xvzf h3-pg.tgz && \
|
||||
cd h3-pg-4.0.1 && \
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -155,9 +139,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
|
||||
FROM build-deps AS unit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
|
||||
tar xvzf 7.7.tar.gz && \
|
||||
cd postgresql-unit-7.7 && \
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
@@ -176,12 +159,27 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz &
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \
|
||||
cd pgvector && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgjwt-pg-build"
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -196,6 +194,7 @@ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -258,6 +257,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libicu67, locales for collations (including ICU)
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
locales \
|
||||
@@ -269,6 +269,8 @@ RUN apt update && \
|
||||
libproj19 \
|
||||
libprotobuf-c1 \
|
||||
libsfcgal1 \
|
||||
libxml2 \
|
||||
libxslt1.1 \
|
||||
gdb && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
32
Dockerfile.vm-compute-node
Normal file
32
Dockerfile.vm-compute-node
Normal file
@@ -0,0 +1,32 @@
|
||||
# Note: this file *mostly* just builds on Dockerfile.compute-node
|
||||
|
||||
ARG SRC_IMAGE
|
||||
ARG VM_INFORMANT_VERSION=v0.1.6
|
||||
|
||||
# Pull VM informant and set up inittab
|
||||
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
|
||||
|
||||
RUN set -e \
|
||||
&& rm -f /etc/inittab \
|
||||
&& touch /etc/inittab
|
||||
|
||||
ADD vm-cgconfig.conf /etc/cgconfig.conf
|
||||
RUN set -e \
|
||||
&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
|
||||
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab
|
||||
|
||||
# Combine, starting from non-VM compute node image.
|
||||
FROM $SRC_IMAGE as base
|
||||
|
||||
# Temporarily set user back to root so we can run apt update and adduser
|
||||
USER root
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
cgroup-tools
|
||||
RUN adduser vm-informant --disabled-password --no-create-home
|
||||
USER postgres
|
||||
|
||||
COPY --from=informant /etc/inittab /etc/inittab
|
||||
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
|
||||
|
||||
ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
|
||||
12
Makefile
12
Makefile
@@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-%
|
||||
|
||||
.PHONY: neon-pg-ext-clean-%
|
||||
neon-pg-ext-clean-%:
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
|
||||
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
|
||||
15
README.md
15
README.md
@@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||
protobuf-devel
|
||||
```
|
||||
* On Arch based systems, these packages are needed:
|
||||
```bash
|
||||
pacman -S base-devel readline zlib libseccomp openssl clang \
|
||||
postgresql-libs cmake postgresql protobuf
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
@@ -83,9 +88,10 @@ cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`nproc`"
|
||||
# use "BUILD_TYPE=release make -j`nproc` -s"
|
||||
# Remove -s for the verbose build log
|
||||
|
||||
make -j`nproc`
|
||||
make -j`nproc` -s
|
||||
```
|
||||
|
||||
#### Building on OSX
|
||||
@@ -99,9 +105,10 @@ cd neon
|
||||
|
||||
# The preferred and default is to make a debug build. This will create a
|
||||
# demonstrably slower build than a release build. For a release build,
|
||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
|
||||
# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s"
|
||||
# Remove -s for the verbose build log
|
||||
|
||||
make -j`sysctl -n hw.logicalcpu`
|
||||
make -j`sysctl -n hw.logicalcpu` -s
|
||||
```
|
||||
|
||||
#### Dependency installation notes
|
||||
|
||||
@@ -44,7 +44,6 @@ use tracing::{error, info};
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::informant::spawn_vm_informant_if_present;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
@@ -141,8 +140,6 @@ fn main() -> Result<()> {
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
// Also spawn the thread responsible for handling the VM informant -- if it's present
|
||||
let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
|
||||
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
use std::path::Path;
|
||||
use std::process;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
|
||||
const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
|
||||
|
||||
/// Launch a thread to start the VM informant if it's present (and restart, on failure)
|
||||
pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
|
||||
let exists = Path::new(VM_INFORMANT_PATH)
|
||||
.try_exists()
|
||||
.context("could not check if path exists")?;
|
||||
|
||||
if !exists {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(
|
||||
thread::Builder::new()
|
||||
.name("run-vm-informant".into())
|
||||
.spawn(move || run_informant())?,
|
||||
))
|
||||
}
|
||||
|
||||
fn run_informant() -> ! {
|
||||
let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
|
||||
|
||||
info!("starting VM informant");
|
||||
|
||||
loop {
|
||||
let mut cmd = process::Command::new(VM_INFORMANT_PATH);
|
||||
// Block on subprocess:
|
||||
let result = cmd.status();
|
||||
|
||||
match result {
|
||||
Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
|
||||
Ok(status) if !status.success() => {
|
||||
warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
|
||||
}
|
||||
Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
|
||||
}
|
||||
|
||||
// Wait before retrying
|
||||
thread::sleep(restart_wait);
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,6 @@ pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod informant;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898'
|
||||
checkpoint_distance = '268435456' # in bytes
|
||||
checkpoint_timeout = '10m'
|
||||
|
||||
gc_period = '100 s'
|
||||
gc_period = '1 hour'
|
||||
gc_horizon = '67108864'
|
||||
|
||||
max_file_descriptors = '100'
|
||||
@@ -101,7 +101,7 @@ away.
|
||||
|
||||
#### gc_period
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
Interval at which garbage collection is triggered. Default is 1 hour.
|
||||
|
||||
#### image_creation_threshold
|
||||
|
||||
@@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3.
|
||||
|
||||
#### pitr_interval
|
||||
|
||||
WAL retention duration for PITR branching. Default is 30 days.
|
||||
WAL retention duration for PITR branching. Default is 7 days.
|
||||
|
||||
#### walreceiver_connect_timeout
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ The synthetic size is designed to:
|
||||
|
||||
## Terms & assumptions
|
||||
|
||||
- logical size is the size of a database *at a given point in
|
||||
- logical size is the size of a branch *at a given point in
|
||||
time*. It's the total size of all tables in all databases, as you
|
||||
see with "\l+" in psql for example, plus the Postgres SLRUs and some
|
||||
small amount of metadata. NOTE that currently, Neon does not include
|
||||
|
||||
@@ -75,27 +75,36 @@ impl StartupMessageParams {
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let mut last_was_escape = false;
|
||||
let iter = self
|
||||
.get("options")?
|
||||
.split(move |c: char| {
|
||||
// We split by non-escaped whitespace symbols.
|
||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||
last_was_escape = c == '\\' && !last_was_escape;
|
||||
should_split
|
||||
})
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
Some(iter)
|
||||
self.get("options").map(Self::parse_options_raw)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
||||
self.get("options").map(Self::parse_options_escaped)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let iter = self.options_raw()?.map(|s| {
|
||||
let mut last_was_escape = false;
|
||||
input
|
||||
.split(move |c: char| {
|
||||
// We split by non-escaped whitespace symbols.
|
||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||
last_was_escape = c == '\\' && !last_was_escape;
|
||||
should_split
|
||||
})
|
||||
.filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
pub fn parse_options_escaped(input: &str) -> impl Iterator<Item = Cow<'_, str>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
Self::parse_options_raw(input).map(|s| {
|
||||
let mut preserve_next_escape = false;
|
||||
let escape = |c| {
|
||||
// We should remove '\\' unless it's preceded by '\\'.
|
||||
@@ -108,9 +117,12 @@ impl StartupMessageParams {
|
||||
true => Cow::Owned(s.replace(escape, "")),
|
||||
false => Cow::Borrowed(s),
|
||||
}
|
||||
});
|
||||
})
|
||||
}
|
||||
|
||||
Some(iter)
|
||||
/// Iterate through key-value pairs in an arbitrary order.
|
||||
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
|
||||
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
|
||||
@@ -7,5 +7,7 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
219
libs/tenant_size_model/src/calculation.rs
Normal file
219
libs/tenant_size_model/src/calculation.rs
Normal file
@@ -0,0 +1,219 @@
|
||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
#[derive(Clone, Debug)]
|
||||
struct SegmentSize {
|
||||
method: SegmentMethod,
|
||||
|
||||
// calculated size of this subtree, using this method
|
||||
accum_size: u64,
|
||||
|
||||
seg_id: usize,
|
||||
children: Vec<SegmentSize>,
|
||||
}
|
||||
|
||||
struct SizeAlternatives {
|
||||
// cheapest alternative if parent is available.
|
||||
incremental: SegmentSize,
|
||||
|
||||
// cheapest alternative if parent node is not available
|
||||
non_incremental: Option<SegmentSize>,
|
||||
}
|
||||
|
||||
impl StorageModel {
|
||||
pub fn calculate(&self) -> SizeResult {
|
||||
// Build adjacency list. 'child_list' is indexed by segment id. Each entry
|
||||
// contains a list of all child segments of the segment.
|
||||
let mut roots: Vec<usize> = Vec::new();
|
||||
let mut child_list: Vec<Vec<usize>> = Vec::new();
|
||||
child_list.resize(self.segments.len(), Vec::new());
|
||||
|
||||
for (seg_id, seg) in self.segments.iter().enumerate() {
|
||||
if let Some(parent_id) = seg.parent {
|
||||
child_list[parent_id].push(seg_id);
|
||||
} else {
|
||||
roots.push(seg_id);
|
||||
}
|
||||
}
|
||||
|
||||
let mut segment_results = Vec::new();
|
||||
segment_results.resize(
|
||||
self.segments.len(),
|
||||
SegmentSizeResult {
|
||||
method: SegmentMethod::Skipped,
|
||||
accum_size: 0,
|
||||
},
|
||||
);
|
||||
|
||||
let mut total_size = 0;
|
||||
for root in roots {
|
||||
if let Some(selected) = self.size_here(root, &child_list).non_incremental {
|
||||
StorageModel::fill_selected_sizes(&selected, &mut segment_results);
|
||||
total_size += selected.accum_size;
|
||||
} else {
|
||||
// Couldn't find any way to get this root. Error?
|
||||
}
|
||||
}
|
||||
|
||||
SizeResult {
|
||||
total_size,
|
||||
segments: segment_results,
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec<SegmentSizeResult>) {
|
||||
result[selected.seg_id] = SegmentSizeResult {
|
||||
method: selected.method,
|
||||
accum_size: selected.accum_size,
|
||||
};
|
||||
// recurse to children
|
||||
for child in selected.children.iter() {
|
||||
StorageModel::fill_selected_sizes(child, result);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// This is the core of the sizing calculation.
|
||||
//
|
||||
// This is a recursive function, that for each Segment calculates the best way
|
||||
// to reach all the Segments that are marked as needed in this subtree, under two
|
||||
// different conditions:
|
||||
// a) when the parent of this segment is available (as a snaphot or through WAL), and
|
||||
// b) when the parent of this segment is not available.
|
||||
//
|
||||
fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives {
|
||||
let seg = &self.segments[seg_id];
|
||||
// First figure out the best way to get each child
|
||||
let mut children = Vec::new();
|
||||
for child_id in &child_list[seg_id] {
|
||||
children.push(self.size_here(*child_id, child_list))
|
||||
}
|
||||
|
||||
// Method 1. If this node is not needed, we can skip it as long as we
|
||||
// take snapshots later in each sub-tree
|
||||
let snapshot_later = if !seg.needed {
|
||||
let mut snapshot_later = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::Skipped,
|
||||
accum_size: 0,
|
||||
children: Vec::new(),
|
||||
};
|
||||
|
||||
let mut possible = true;
|
||||
for child in children.iter() {
|
||||
if let Some(non_incremental) = &child.non_incremental {
|
||||
snapshot_later.accum_size += non_incremental.accum_size;
|
||||
snapshot_later.children.push(non_incremental.clone())
|
||||
} else {
|
||||
possible = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if possible {
|
||||
Some(snapshot_later)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of
|
||||
// this Segment was given.
|
||||
let snapshot_here = if !seg.needed || seg.parent.is_none() {
|
||||
if let Some(snapshot_size) = seg.size {
|
||||
let mut snapshot_here = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::SnapshotHere,
|
||||
accum_size: snapshot_size,
|
||||
children: Vec::new(),
|
||||
};
|
||||
for child in children.iter() {
|
||||
snapshot_here.accum_size += child.incremental.accum_size;
|
||||
snapshot_here.children.push(child.incremental.clone())
|
||||
}
|
||||
Some(snapshot_here)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Method 3. Use WAL to get here from parent
|
||||
let wal_here = {
|
||||
let mut wal_here = SegmentSize {
|
||||
seg_id,
|
||||
method: SegmentMethod::Wal,
|
||||
accum_size: if let Some(parent_id) = seg.parent {
|
||||
seg.lsn - self.segments[parent_id].lsn
|
||||
} else {
|
||||
0
|
||||
},
|
||||
children: Vec::new(),
|
||||
};
|
||||
for child in children {
|
||||
wal_here.accum_size += child.incremental.accum_size;
|
||||
wal_here.children.push(child.incremental)
|
||||
}
|
||||
wal_here
|
||||
};
|
||||
|
||||
// If the parent is not available, what's the cheapest method involving
|
||||
// a snapshot here or later?
|
||||
let mut cheapest_non_incremental: Option<SegmentSize> = None;
|
||||
if let Some(snapshot_here) = snapshot_here {
|
||||
cheapest_non_incremental = Some(snapshot_here);
|
||||
}
|
||||
if let Some(snapshot_later) = snapshot_later {
|
||||
// Use <=, to prefer skipping if the size is equal
|
||||
if let Some(parent) = &cheapest_non_incremental {
|
||||
if snapshot_later.accum_size <= parent.accum_size {
|
||||
cheapest_non_incremental = Some(snapshot_later);
|
||||
}
|
||||
} else {
|
||||
cheapest_non_incremental = Some(snapshot_later);
|
||||
}
|
||||
}
|
||||
|
||||
// And what's the cheapest method, if the parent is available?
|
||||
let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental
|
||||
{
|
||||
// Is it cheaper to use a snapshot here or later, anyway?
|
||||
// Use <, to prefer Wal over snapshot if the cost is the same
|
||||
if wal_here.accum_size < cheapest_non_incremental.accum_size {
|
||||
wal_here
|
||||
} else {
|
||||
cheapest_non_incremental.clone()
|
||||
}
|
||||
} else {
|
||||
wal_here
|
||||
};
|
||||
|
||||
SizeAlternatives {
|
||||
incremental: cheapest_incremental,
|
||||
non_incremental: cheapest_non_incremental,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,401 +1,70 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
//! Synthetic size calculation
|
||||
|
||||
use anyhow::Context;
|
||||
mod calculation;
|
||||
pub mod svg;
|
||||
|
||||
/// Pricing model or history size builder.
|
||||
/// StorageModel is the input to the synthetic size calculation. It represents
|
||||
/// a tree of timelines, with just the information that's needed for the
|
||||
/// calculation. This doesn't track timeline names or where each timeline
|
||||
/// begins and ends, for example. Instead, it consists of "points of interest"
|
||||
/// on the timelines. A point of interest could be the timeline start or end point,
|
||||
/// the oldest point on a timeline that needs to be retained because of PITR
|
||||
/// cutoff, or snapshot points named by the user. For each such point, and the
|
||||
/// edge connecting the points (implicit in Segment), we store information about
|
||||
/// whether we need to be able to recover to the point, and if known, the logical
|
||||
/// size at the point.
|
||||
///
|
||||
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||
/// type.
|
||||
pub struct Storage<K: 'static> {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of a segment describing it's latest state.
|
||||
branches: HashMap<K, usize>,
|
||||
/// The segments must form a well-formed tree, with no loops.
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct StorageModel {
|
||||
pub segments: Vec<Segment>,
|
||||
}
|
||||
|
||||
/// Snapshot of a branch.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
/// Segment represents one point in the tree of branches, *and* the edge that leads
|
||||
/// to it (if any). We don't need separate structs for points and edges, because each
|
||||
/// point can have only one parent.
|
||||
///
|
||||
/// When 'needed' is true, it means that we need to be able to reconstruct
|
||||
/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only
|
||||
/// a single point is needed, create two Segments with the same lsn, and mark only
|
||||
/// the child as needed.
|
||||
///
|
||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct Segment {
|
||||
/// Previous segment index into ['Storage::segments`], if any.
|
||||
parent: Option<usize>,
|
||||
pub parent: Option<usize>,
|
||||
|
||||
/// Description of how did we get to this state.
|
||||
///
|
||||
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
|
||||
/// modifying a branch directly.
|
||||
pub op: Cow<'static, str>,
|
||||
/// LSN at this point
|
||||
pub lsn: u64,
|
||||
|
||||
/// LSN before this state
|
||||
start_lsn: u64,
|
||||
/// Logical size at this node, if known.
|
||||
pub size: Option<u64>,
|
||||
|
||||
/// LSN at this state
|
||||
pub end_lsn: u64,
|
||||
|
||||
/// Logical size before this state
|
||||
start_size: u64,
|
||||
|
||||
/// Logical size at this state. Can be None in the last Segment of a branch.
|
||||
pub end_size: Option<u64>,
|
||||
|
||||
/// Indices to [`Storage::segments`]
|
||||
///
|
||||
/// FIXME: this could be an Option<usize>
|
||||
children_after: Vec<usize>,
|
||||
|
||||
/// Determined by `retention_period` given to [`Storage::calculate`]
|
||||
/// If true, the segment from parent to this node is needed by `retention_period`
|
||||
pub needed: bool,
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// *-g--*---D--->
|
||||
// /
|
||||
// /
|
||||
// / *---b----*-B--->
|
||||
// / /
|
||||
// / /
|
||||
// -----*--e---*-----f----* C
|
||||
// E \
|
||||
// \
|
||||
// *--a---*---A-->
|
||||
//
|
||||
// If A and B need to be retained, is it cheaper to store
|
||||
// snapshot at C+a+b, or snapshots at A and B ?
|
||||
//
|
||||
// If D also needs to be retained, which is cheaper:
|
||||
//
|
||||
// 1. E+g+e+f+a+b
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
/// Result of synthetic size calculation. Returned by StorageModel::calculate()
|
||||
pub struct SizeResult {
|
||||
pub total_size: u64,
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
pub struct SegmentSize {
|
||||
pub seg_id: usize,
|
||||
|
||||
pub method: SegmentMethod,
|
||||
|
||||
this_size: u64,
|
||||
|
||||
pub children: Vec<SegmentSize>,
|
||||
// This has same length as the StorageModel::segments vector in the input.
|
||||
// Each entry in this array corresponds to the entry with same index in
|
||||
// StorageModel::segments.
|
||||
pub segments: Vec<SegmentSizeResult>,
|
||||
}
|
||||
|
||||
impl SegmentSize {
|
||||
fn total(&self) -> u64 {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
|
||||
pub fn total_children(&self) -> u64 {
|
||||
if self.method == SnapshotAfter {
|
||||
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
} else {
|
||||
self.children.iter().fold(0, |acc, x| acc + x.total())
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct SegmentSizeResult {
|
||||
pub method: SegmentMethod,
|
||||
// calculated size of this subtree, using this method
|
||||
pub accum_size: u64,
|
||||
}
|
||||
|
||||
/// Different methods to retain history from a particular state
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum SegmentMethod {
|
||||
SnapshotAfter,
|
||||
Wal,
|
||||
WalNeeded,
|
||||
SnapshotHere, // A logical snapshot is needed after this segment
|
||||
Wal, // Keep WAL leading up to this node
|
||||
Skipped,
|
||||
}
|
||||
|
||||
use SegmentMethod::*;
|
||||
|
||||
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: K) -> Storage<K> {
|
||||
let init_segment = Segment {
|
||||
op: "".into(),
|
||||
needed: false,
|
||||
parent: None,
|
||||
start_lsn: 0,
|
||||
end_lsn: 0,
|
||||
start_size: 0,
|
||||
end_size: Some(0),
|
||||
children_after: Vec::new(),
|
||||
};
|
||||
|
||||
Storage {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch, 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with a new point, at given LSN.
|
||||
pub fn insert_point<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn: u64,
|
||||
size: Option<u64>,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
assert!(lsn > lastseg.end_lsn);
|
||||
|
||||
let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lsn,
|
||||
start_size,
|
||||
end_size: size,
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch<Q: ?Sized>(
|
||||
&mut self,
|
||||
branch: &Q,
|
||||
op: Cow<'static, str>,
|
||||
lsn_bytes: u64,
|
||||
size_bytes: i64,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||
start_size: last_end_size,
|
||||
end_size: Some((last_end_size as i64 + size_bytes) as u64),
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
lastseg.children_after.push(newseg_id);
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
|
||||
}
|
||||
|
||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "update".into(), bytes, 0i64)
|
||||
}
|
||||
|
||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
|
||||
}
|
||||
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q> + std::fmt::Debug,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
// Find the right segment
|
||||
let branchseg_id = *self.branches.get(parent).with_context(|| {
|
||||
format!(
|
||||
"should had found the parent {:?} by key. in branches {:?}",
|
||||
parent, self.branches
|
||||
)
|
||||
})?;
|
||||
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name, branchseg_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.end_lsn < cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: For each oldest segment in a chain that needs to be retained,
|
||||
// calculate if we should store snapshot or WAL
|
||||
self.size_from_snapshot_later(0)
|
||||
}
|
||||
|
||||
fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
let seg = &self.segments[seg_id];
|
||||
|
||||
let this_size = seg.end_lsn - seg.start_lsn;
|
||||
|
||||
let mut children = Vec::new();
|
||||
|
||||
// try both ways
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: if seg.needed { WalNeeded } else { Wal },
|
||||
this_size,
|
||||
children,
|
||||
})
|
||||
}
|
||||
|
||||
fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
// If this is needed, then it's time to do the snapshot and continue
|
||||
// with wal method.
|
||||
let seg = &self.segments[seg_id];
|
||||
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
|
||||
if seg.needed {
|
||||
let mut children = Vec::new();
|
||||
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
p2
|
||||
}
|
||||
} else {
|
||||
p1
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: WalNeeded,
|
||||
this_size: seg.start_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
// If any of the direct children are "needed", need to be able to reconstruct here
|
||||
let mut children_needed = false;
|
||||
for &child in seg.children_after.iter() {
|
||||
let seg = &self.segments[child];
|
||||
if seg.needed {
|
||||
children_needed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let method1 = if !children_needed {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_snapshot_later(*child)?);
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: Skipped,
|
||||
this_size: 0,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// If this a junction, consider snapshotting here
|
||||
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_wal(*child)?);
|
||||
}
|
||||
let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: SnapshotAfter,
|
||||
this_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(match (method1, method2) {
|
||||
(None, None) => anyhow::bail!(
|
||||
"neither method was applicable: children_after={}, children_needed={}",
|
||||
seg.children_after.len(),
|
||||
children_needed
|
||||
),
|
||||
(Some(method), None) => method,
|
||||
(None, Some(method)) => method,
|
||||
(Some(method1), Some(method2)) => {
|
||||
if method1.total() < method2.total() {
|
||||
method1
|
||||
} else {
|
||||
method2
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_segments(self) -> Vec<Segment> {
|
||||
self.segments
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,269 +0,0 @@
|
||||
//! Tenant size model testing ground.
|
||||
//!
|
||||
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
|
||||
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
|
||||
//! into pngs.
|
||||
|
||||
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
storage.update("main", 1_000)?;
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
let mut storage = Storage::new("a");
|
||||
storage.insert("a", 5000)?;
|
||||
storage.branch("a", "b")?;
|
||||
storage.update("b", 4000)?;
|
||||
storage.update("a", 2000)?;
|
||||
storage.branch("a", "c")?;
|
||||
storage.insert("c", 4000)?;
|
||||
storage.insert("a", 2000)?;
|
||||
|
||||
let size = storage.calculate(5000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
use std::borrow::Cow;
|
||||
|
||||
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||
|
||||
let branches = [
|
||||
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
|
||||
Some(0xb1719e044db05401a05a2ed588a3ad3f),
|
||||
Some(0xb68d6691c895ad0a70809470020929ef),
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut storage = Storage::new(None);
|
||||
|
||||
storage.branch(&None, branches[0])?; // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
|
||||
storage.branch(&branches[0], branches[1])?; // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
|
||||
storage.branch(&branches[0], branches[2])?; // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
|
||||
|
||||
let size = storage.calculate(100_000)?;
|
||||
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
|
||||
let scenario = if args.len() < 2 { "1" } else { &args[1] };
|
||||
|
||||
let (segments, size) = match scenario {
|
||||
"1" => scenario_1(),
|
||||
"2" => scenario_2(),
|
||||
"3" => scenario_3(),
|
||||
"4" => scenario_4(),
|
||||
"5" => scenario_5(),
|
||||
"6" => scenario_6(),
|
||||
other => {
|
||||
eprintln!("invalid scenario {}", other);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
.unwrap();
|
||||
|
||||
graphviz_tree(&segments, &size);
|
||||
}
|
||||
|
||||
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
|
||||
use tenant_size_model::SegmentMethod::*;
|
||||
|
||||
let seg_id = node.seg_id;
|
||||
let seg = segments.get(seg_id).unwrap();
|
||||
let lsn = seg.end_lsn;
|
||||
let size = seg.end_size.unwrap_or(0);
|
||||
let method = node.method;
|
||||
|
||||
println!(" {{");
|
||||
println!(" node [width=0.1 height=0.1 shape=oval]");
|
||||
|
||||
let tenant_size = node.total_children();
|
||||
|
||||
let penwidth = if seg.needed { 6 } else { 3 };
|
||||
let x = match method {
|
||||
SnapshotAfter =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
|
||||
Wal =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
WalNeeded =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
|
||||
Skipped =>
|
||||
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
|
||||
};
|
||||
|
||||
println!(" \"seg{seg_id}\" [{x}]");
|
||||
println!(" }}");
|
||||
|
||||
// Recurse. Much of the data is actually on the edge
|
||||
for child in node.children.iter() {
|
||||
let child_id = child.seg_id;
|
||||
graphviz_recurse(segments, child);
|
||||
|
||||
let edge_color = match child.method {
|
||||
SnapshotAfter => "gray",
|
||||
Wal => "black",
|
||||
WalNeeded => "black",
|
||||
Skipped => "gray",
|
||||
};
|
||||
|
||||
println!(" {{");
|
||||
println!(" edge [] ");
|
||||
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
|
||||
print!("color={edge_color}");
|
||||
if child.method == WalNeeded {
|
||||
print!(" penwidth=6");
|
||||
}
|
||||
if child.method == Wal {
|
||||
print!(" penwidth=3");
|
||||
}
|
||||
|
||||
let next = segments.get(child_id).unwrap();
|
||||
|
||||
if next.op.is_empty() {
|
||||
print!(
|
||||
" label=\"{} / {}\"",
|
||||
next.end_lsn - seg.end_lsn,
|
||||
(next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
|
||||
);
|
||||
} else {
|
||||
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
|
||||
}
|
||||
println!("]");
|
||||
println!(" }}");
|
||||
}
|
||||
}
|
||||
|
||||
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
||||
println!("digraph G {{");
|
||||
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
|
||||
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
|
||||
println!(" graph [center=1 rankdir=LR]");
|
||||
println!(" edge [dir=none]");
|
||||
|
||||
graphviz_recurse(segments, tree);
|
||||
|
||||
println!("}}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenarios_return_same_size() {
|
||||
type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
|
||||
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||
(line!(), scenario_1, 8000),
|
||||
(line!(), scenario_2, 9000),
|
||||
(line!(), scenario_3, 13000),
|
||||
(line!(), scenario_4, 16000),
|
||||
(line!(), scenario_5, 17000),
|
||||
(line!(), scenario_6, 333_792_000),
|
||||
];
|
||||
|
||||
for (line, scenario, expected) in truths {
|
||||
let (_, size) = scenario().unwrap();
|
||||
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||
}
|
||||
}
|
||||
193
libs/tenant_size_model/src/svg.rs
Normal file
193
libs/tenant_size_model/src/svg.rs
Normal file
@@ -0,0 +1,193 @@
|
||||
use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
use std::fmt::Write;
|
||||
|
||||
const SVG_WIDTH: f32 = 500.0;
|
||||
|
||||
struct SvgDraw<'a> {
|
||||
storage: &'a StorageModel,
|
||||
branches: &'a [String],
|
||||
seg_to_branch: &'a [usize],
|
||||
sizes: &'a [SegmentSizeResult],
|
||||
|
||||
// layout
|
||||
xscale: f32,
|
||||
min_lsn: u64,
|
||||
seg_coordinates: Vec<(f32, f32)>,
|
||||
}
|
||||
|
||||
fn draw_legend(result: &mut String) -> anyhow::Result<()> {
|
||||
writeln!(
|
||||
result,
|
||||
"<circle cx=\"10\" cy=\"10\" r=\"5\" stroke=\"red\"/>"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"15\">logical snapshot</text>")?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"30\" x2=\"15\" y2=\"30\" stroke-width=\"6\" stroke=\"black\" />"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<text x=\"20\" y=\"35\">WAL within retention period</text>"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"50\" x2=\"15\" y2=\"50\" stroke-width=\"3\" stroke=\"black\" />"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<text x=\"20\" y=\"55\">WAL retained to avoid copy</text>"
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn draw_svg(
|
||||
storage: &StorageModel,
|
||||
branches: &[String],
|
||||
seg_to_branch: &[usize],
|
||||
sizes: &SizeResult,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut draw = SvgDraw {
|
||||
storage,
|
||||
branches,
|
||||
seg_to_branch,
|
||||
sizes: &sizes.segments,
|
||||
|
||||
xscale: 0.0,
|
||||
min_lsn: 0,
|
||||
seg_coordinates: Vec::new(),
|
||||
};
|
||||
|
||||
let mut result = String::new();
|
||||
|
||||
writeln!(result, "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" height=\"300\" width=\"500\">")?;
|
||||
|
||||
draw.calculate_svg_layout();
|
||||
|
||||
// Draw the tree
|
||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
||||
draw.draw_seg_phase1(seg_id, &mut result)?;
|
||||
}
|
||||
|
||||
// Draw snapshots
|
||||
for (seg_id, _seg) in storage.segments.iter().enumerate() {
|
||||
draw.draw_seg_phase2(seg_id, &mut result)?;
|
||||
}
|
||||
|
||||
draw_legend(&mut result)?;
|
||||
|
||||
write!(result, "</svg>")?;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
impl<'a> SvgDraw<'a> {
|
||||
fn calculate_svg_layout(&mut self) {
|
||||
// Find x scale
|
||||
let segments = &self.storage.segments;
|
||||
let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min);
|
||||
let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max);
|
||||
|
||||
// Start with 1 pixel = 1 byte. Double the scale until it fits into the image
|
||||
let mut xscale = 1.0;
|
||||
while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH {
|
||||
xscale *= 2.0;
|
||||
}
|
||||
|
||||
// Layout the timelines on Y dimension.
|
||||
// TODO
|
||||
let mut y = 100.0;
|
||||
let mut branch_y_coordinates = Vec::new();
|
||||
for _branch in self.branches {
|
||||
branch_y_coordinates.push(y);
|
||||
y += 40.0;
|
||||
}
|
||||
|
||||
// Calculate coordinates for each point
|
||||
let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
|
||||
.map(|(seg, branch_id)| {
|
||||
let x = (seg.lsn - min_lsn) as f32 / xscale;
|
||||
let y = branch_y_coordinates[*branch_id];
|
||||
(x, y)
|
||||
})
|
||||
.collect();
|
||||
|
||||
self.xscale = xscale;
|
||||
self.min_lsn = min_lsn;
|
||||
self.seg_coordinates = seg_coordinates;
|
||||
}
|
||||
|
||||
/// Draws lines between points
|
||||
fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
||||
let seg = &self.storage.segments[seg_id];
|
||||
|
||||
let wal_bytes = if let Some(parent_id) = seg.parent {
|
||||
seg.lsn - self.storage.segments[parent_id].lsn
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let style = match self.sizes[seg_id].method {
|
||||
SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
|
||||
SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
|
||||
"stroke-width=\"6\" stroke=\"black\""
|
||||
}
|
||||
SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
|
||||
SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
|
||||
};
|
||||
if let Some(parent_id) = seg.parent {
|
||||
let (x1, y1) = self.seg_coordinates[parent_id];
|
||||
let (x2, y2) = self.seg_coordinates[seg_id];
|
||||
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
" <title>{wal_bytes} bytes of WAL (seg {seg_id})</title>"
|
||||
)?;
|
||||
writeln!(result, "</line>")?;
|
||||
} else {
|
||||
// draw a little dash to mark the starting point of this branch
|
||||
let (x, y) = self.seg_coordinates[seg_id];
|
||||
let (x1, y1) = (x, y - 5.0);
|
||||
let (x2, y2) = (x, y + 5.0);
|
||||
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
||||
)?;
|
||||
writeln!(result, " <title>(seg {seg_id})</title>")?;
|
||||
writeln!(result, "</line>")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Draw circles where snapshots are taken
|
||||
fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
|
||||
let seg = &self.storage.segments[seg_id];
|
||||
|
||||
// draw a snapshot point if it's needed
|
||||
let (coord_x, coord_y) = self.seg_coordinates[seg_id];
|
||||
if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
|
||||
writeln!(
|
||||
result,
|
||||
"<circle cx=\"{coord_x}\" cy=\"{coord_y}\" r=\"5\" stroke=\"red\">",
|
||||
)?;
|
||||
writeln!(
|
||||
result,
|
||||
" <title>logical size {}</title>",
|
||||
seg.size.unwrap()
|
||||
)?;
|
||||
write!(result, "</circle>")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
313
libs/tenant_size_model/tests/tests.rs
Normal file
313
libs/tenant_size_model/tests/tests.rs
Normal file
@@ -0,0 +1,313 @@
|
||||
//! Tenant size model tests.
|
||||
|
||||
use tenant_size_model::{Segment, SizeResult, StorageModel};
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
struct ScenarioBuilder {
|
||||
segments: Vec<Segment>,
|
||||
|
||||
/// Mapping from the branch name to the index of a segment describing its latest state.
|
||||
branches: HashMap<String, usize>,
|
||||
}
|
||||
|
||||
impl ScenarioBuilder {
|
||||
/// Creates a new storage with the given default branch name.
|
||||
pub fn new(initial_branch: &str) -> ScenarioBuilder {
|
||||
let init_segment = Segment {
|
||||
parent: None,
|
||||
lsn: 0,
|
||||
size: Some(0),
|
||||
needed: false, // determined later
|
||||
};
|
||||
|
||||
ScenarioBuilder {
|
||||
segments: vec![init_segment],
|
||||
branches: HashMap::from([(initial_branch.into(), 0)]),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) {
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let newseg = Segment {
|
||||
parent: Some(lastseg_id),
|
||||
lsn: lastseg.lsn + lsn_bytes,
|
||||
size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
|
||||
needed: false,
|
||||
};
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, bytes as i64);
|
||||
}
|
||||
|
||||
pub fn update(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, 0i64);
|
||||
}
|
||||
|
||||
pub fn _delete(&mut self, branch: &str, bytes: u64) {
|
||||
self.modify_branch(branch, bytes, -(bytes as i64));
|
||||
}
|
||||
|
||||
/// Panics if the parent branch cannot be found.
|
||||
pub fn branch(&mut self, parent: &str, name: &str) {
|
||||
// Find the right segment
|
||||
let branchseg_id = *self
|
||||
.branches
|
||||
.get(parent)
|
||||
.expect("should had found the parent by key");
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name.to_string(), branchseg_id);
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period);
|
||||
let mut seg_id = last_seg_id;
|
||||
loop {
|
||||
let seg = &mut self.segments[seg_id];
|
||||
if seg.lsn <= cutoff_lsn {
|
||||
break;
|
||||
}
|
||||
seg.needed = true;
|
||||
if let Some(prev_seg_id) = seg.parent {
|
||||
seg_id = prev_seg_id;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Perform the calculation
|
||||
let storage_model = StorageModel {
|
||||
segments: self.segments.clone(),
|
||||
};
|
||||
let size_result = storage_model.calculate();
|
||||
(storage_model, size_result)
|
||||
}
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
#[test]
|
||||
fn scenario_1() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Calculate the synthetic size with retention horizon 1000
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
// The end of the branch is at LSN 10000. Need to retain
|
||||
// a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
|
||||
// The logical snapshot has size 5000.
|
||||
assert_eq!(result.total_size, 5000 + 1000);
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
#[test]
|
||||
fn scenario_2() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
scenario.update("main", 1_000);
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 11000
|
||||
// *----*----*--------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, we need to retain logical snapshot
|
||||
// at the branch point, size 5000, and the WAL from 10000-11000 on
|
||||
// both branches.
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 1000);
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
#[test]
|
||||
fn scenario_3() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 15000
|
||||
// *----*----*------------------------------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, it's still cheapest to retain
|
||||
// - snapshot at branch point (size 5000)
|
||||
// - WAL on child between 10000-11000
|
||||
// - WAL on main between 10000-15000
|
||||
//
|
||||
// This is in total 5000 + 1000 + 5000
|
||||
//
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 5000);
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
#[test]
|
||||
fn scenario_4() {
|
||||
// Create main branch
|
||||
let mut scenario = ScenarioBuilder::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
scenario.insert("main", 5_000);
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
// Branch
|
||||
scenario.branch("main", "child");
|
||||
scenario.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
scenario.update("main", 1_000);
|
||||
}
|
||||
|
||||
//
|
||||
// The history looks like this now:
|
||||
//
|
||||
// 10000 18000
|
||||
// *----*----*------------------------------------* main
|
||||
// |
|
||||
// | 11000
|
||||
// +-------------- child
|
||||
//
|
||||
//
|
||||
// With retention horizon 1000, it's now cheapest to retain
|
||||
// separate snapshots on both branches:
|
||||
// - snapshot on main branch at LSN 17000 (size 5000)
|
||||
// - WAL on main between 17000-18000
|
||||
// - snapshot on child branch at LSN 10000 (size 5000)
|
||||
// - WAL on child between 10000-11000
|
||||
//
|
||||
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
||||
//
|
||||
// (If we used the the method from the previous scenario, and
|
||||
// kept only snapshot at the branch point, we'd need to keep
|
||||
// all the WAL between 10000-18000 on the main branch, so
|
||||
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
||||
// calculation always picks the cheapest alternative)
|
||||
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenario_5() {
|
||||
let mut scenario = ScenarioBuilder::new("a");
|
||||
scenario.insert("a", 5000);
|
||||
scenario.branch("a", "b");
|
||||
scenario.update("b", 4000);
|
||||
scenario.update("a", 2000);
|
||||
scenario.branch("a", "c");
|
||||
scenario.insert("c", 4000);
|
||||
scenario.insert("a", 2000);
|
||||
|
||||
let (_model, result) = scenario.calculate(1000);
|
||||
|
||||
assert_eq!(result.total_size, 17000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scenario_6() {
|
||||
let branches = [
|
||||
"7ff1edab8182025f15ae33482edb590a",
|
||||
"b1719e044db05401a05a2ed588a3ad3f",
|
||||
"0xb68d6691c895ad0a70809470020929ef",
|
||||
];
|
||||
|
||||
// compared to other scenarios, this one uses bytes instead of kB
|
||||
|
||||
let mut scenario = ScenarioBuilder::new("");
|
||||
|
||||
scenario.branch("", branches[0]); // at 0
|
||||
scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
|
||||
scenario.branch(branches[0], branches[1]); // at 108951064
|
||||
scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
|
||||
scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
|
||||
scenario.branch(branches[0], branches[2]); // at 283415424
|
||||
scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
|
||||
scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400
|
||||
|
||||
let (model, result) = scenario.calculate(100_000);
|
||||
|
||||
// FIXME: We previously calculated 333_792_000. But with this PR, we get
|
||||
// a much lower number. At a quick look at the model output and the
|
||||
// calculations here, the new result seems correct to me.
|
||||
eprintln!(
|
||||
" MODEL: {}",
|
||||
serde_json::to_string(&model.segments).unwrap()
|
||||
);
|
||||
eprintln!(
|
||||
"RESULT: {}",
|
||||
serde_json::to_string(&result.segments).unwrap()
|
||||
);
|
||||
|
||||
assert_eq!(result.total_size, 136_236_928);
|
||||
}
|
||||
@@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Disable the default rust panic hook by using `set_hook`.
|
||||
///
|
||||
/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
|
||||
/// that sentry is configured (if needed). sentry will install it's own on top of this, always
|
||||
/// processing the panic before we log it.
|
||||
///
|
||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
||||
/// If the assumptions about the initialization order are not held, use
|
||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// lost.
|
||||
#[must_use]
|
||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
||||
std::panic::set_hook(Box::new(tracing_panic_hook));
|
||||
TracingPanicHookGuard::new()
|
||||
}
|
||||
|
||||
/// Drop guard which restores the std panic hook on drop.
|
||||
///
|
||||
/// Tracing should not be used when it's not configured, but we cannot really latch on to any
|
||||
/// imaginary lifetime of tracing.
|
||||
pub struct TracingPanicHookGuard {
|
||||
act: bool,
|
||||
}
|
||||
|
||||
impl TracingPanicHookGuard {
|
||||
fn new() -> Self {
|
||||
TracingPanicHookGuard { act: true }
|
||||
}
|
||||
|
||||
/// Make this hook guard not do anything when dropped.
|
||||
pub fn forget(&mut self) {
|
||||
self.act = false;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TracingPanicHookGuard {
|
||||
fn drop(&mut self) {
|
||||
if self.act {
|
||||
let _ = std::panic::take_hook();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Named symbol for our panic hook, which logs the panic.
|
||||
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
|
||||
// following rust 1.66.1 std implementation:
|
||||
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
||||
let location = info.location();
|
||||
|
||||
let msg = match info.payload().downcast_ref::<&'static str>() {
|
||||
Some(s) => *s,
|
||||
None => match info.payload().downcast_ref::<String>() {
|
||||
Some(s) => &s[..],
|
||||
None => "Box<dyn Any>",
|
||||
},
|
||||
};
|
||||
|
||||
let thread = std::thread::current();
|
||||
let thread = thread.name().unwrap_or("<unnamed>");
|
||||
let backtrace = std::backtrace::Backtrace::capture();
|
||||
|
||||
let _entered = if let Some(location) = location {
|
||||
tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
|
||||
} else {
|
||||
// very unlikely to hit here, but the guarantees of std could change
|
||||
tracing::error_span!("panic", %thread)
|
||||
}
|
||||
.entered();
|
||||
|
||||
if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
|
||||
// this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
|
||||
// get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
|
||||
// string, maybe even to a TLS one but tracing already does that.
|
||||
tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
|
||||
} else {
|
||||
tracing::error!("{msg}");
|
||||
}
|
||||
|
||||
// ensure that we log something on the panic if this hook is left after tracing has been
|
||||
// unconfigured. worst case when teardown is racing the panic is to log the panic twice.
|
||||
tracing::dispatcher::get_default(|d| {
|
||||
if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
|
||||
let location = location.map(PrettyLocation);
|
||||
log_panic_to_stderr(thread, msg, location, &backtrace);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[cold]
|
||||
fn log_panic_to_stderr(
|
||||
thread: &str,
|
||||
msg: &str,
|
||||
location: Option<PrettyLocation<'_, '_>>,
|
||||
backtrace: &std::backtrace::Backtrace,
|
||||
) {
|
||||
eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
|
||||
}
|
||||
|
||||
struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
|
||||
|
||||
impl std::fmt::Display for PrettyLocation<'_, '_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PrettyLocation<'_, '_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
<Self as std::fmt::Display>::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize logging, which must be initialized before the custom panic hook is installed.
|
||||
logging::init(conf.log_format)?;
|
||||
|
||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
||||
// disarming this hook on pageserver, because we never tear down tracing.
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(
|
||||
Some(GIT_VERSION.into()),
|
||||
@@ -210,9 +217,6 @@ fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<()> {
|
||||
// Initialize logging
|
||||
logging::init(conf.log_format)?;
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
|
||||
@@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize)]
|
||||
#[derive(Serialize, Debug)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
@@ -75,7 +75,7 @@ pub async fn collect_metrics(
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut prev_iteration_time: Option<std::time::Instant> = None;
|
||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -86,11 +86,11 @@ pub async fn collect_metrics(
|
||||
_ = ticker.tick() => {
|
||||
|
||||
// send cached metrics every cached_metric_collection_interval
|
||||
let send_cached = prev_iteration_time
|
||||
.map(|x| x.elapsed() >= cached_metric_collection_interval)
|
||||
.unwrap_or(false);
|
||||
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
|
||||
|
||||
prev_iteration_time = Some(std::time::Instant::now());
|
||||
if send_cached {
|
||||
prev_iteration_time = std::time::Instant::now();
|
||||
}
|
||||
|
||||
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
|
||||
}
|
||||
@@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration(
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk_to_send.iter() {
|
||||
// Report if the metric value is suspiciously large
|
||||
if metric.value > (1u64 << 40) {
|
||||
error!("potentially abnormal metric value: {:?}", metric);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
|
||||
@@ -437,6 +437,13 @@ paths:
|
||||
type: boolean
|
||||
description: |
|
||||
When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
|
||||
- name: retention_period
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
description: |
|
||||
Override the default retention period (in bytes) used for size calculation.
|
||||
get:
|
||||
description: |
|
||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||
|
||||
@@ -7,19 +7,21 @@ use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::mgr::TenantMapInsertError;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
@@ -38,7 +40,7 @@ use utils::{
|
||||
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||
use super::models::ConfigureFailpointsRequest;
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -479,11 +481,19 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
|
||||
/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
|
||||
/// values.
|
||||
///
|
||||
/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size
|
||||
/// (only if it is shorter than the real cutoff).
|
||||
///
|
||||
/// Note: we don't update the cached size and prometheus metric here.
|
||||
/// The retention period might be different, and it's nice to have a method to just calculate it
|
||||
/// without modifying anything anyway.
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||
let headers = request.headers();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
@@ -492,24 +502,29 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs(&ctx)
|
||||
.gather_size_inputs(retention_period, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let size = if !inputs_only.unwrap_or(false) {
|
||||
Some(
|
||||
tenant
|
||||
.calc_and_update_cached_synthetic_size(&inputs)
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut sizes = None;
|
||||
if !inputs_only.unwrap_or(false) {
|
||||
let storage_model = inputs
|
||||
.calculate_model()
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let size = storage_model.calculate();
|
||||
|
||||
/// Private response type with the additional "unstable" `inputs` field.
|
||||
///
|
||||
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
|
||||
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
|
||||
// If request header expects html, return html
|
||||
if headers["Accept"] == "text/html" {
|
||||
return synthetic_size_html_response(inputs, storage_model, size);
|
||||
}
|
||||
sizes = Some(size);
|
||||
} else if headers["Accept"] == "text/html" {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"inputs_only parameter is incompatible with html output request"
|
||||
)));
|
||||
}
|
||||
|
||||
/// The type resides in the pageserver not to expose `ModelInputs`.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct TenantHistorySize {
|
||||
@@ -519,6 +534,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
///
|
||||
/// Will be none if `?inputs_only=true` was given.
|
||||
size: Option<u64>,
|
||||
/// Size of each segment used in the model.
|
||||
/// Will be null if `?inputs_only=true` was given.
|
||||
segment_sizes: Option<Vec<tenant_size_model::SegmentSizeResult>>,
|
||||
inputs: crate::tenant::size::ModelInputs,
|
||||
}
|
||||
|
||||
@@ -526,7 +544,8 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
StatusCode::OK,
|
||||
TenantHistorySize {
|
||||
id: tenant_id,
|
||||
size,
|
||||
size: sizes.as_ref().map(|x| x.total_size),
|
||||
segment_sizes: sizes.map(|x| x.segments),
|
||||
inputs,
|
||||
},
|
||||
)
|
||||
@@ -591,6 +610,62 @@ async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response
|
||||
}
|
||||
}
|
||||
|
||||
/// Get tenant_size SVG graph along with the JSON data.
|
||||
fn synthetic_size_html_response(
|
||||
inputs: ModelInputs,
|
||||
storage_model: StorageModel,
|
||||
sizes: SizeResult,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let mut timeline_ids: Vec<String> = Vec::new();
|
||||
let mut timeline_map: HashMap<TimelineId, usize> = HashMap::new();
|
||||
for (index, ti) in inputs.timeline_inputs.iter().enumerate() {
|
||||
timeline_map.insert(ti.timeline_id, index);
|
||||
timeline_ids.push(ti.timeline_id.to_string());
|
||||
}
|
||||
let seg_to_branch: Vec<usize> = inputs
|
||||
.segments
|
||||
.iter()
|
||||
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
|
||||
.collect();
|
||||
|
||||
let svg =
|
||||
tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut response = String::new();
|
||||
|
||||
use std::fmt::Write;
|
||||
write!(response, "<html>\n<body>\n").unwrap();
|
||||
write!(response, "<div>\n{svg}\n</div>").unwrap();
|
||||
writeln!(response, "Project size: {}", sizes.total_size).unwrap();
|
||||
writeln!(response, "<pre>").unwrap();
|
||||
writeln!(
|
||||
response,
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&inputs).unwrap()
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(
|
||||
response,
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&sizes.segments).unwrap()
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(response, "</pre>").unwrap();
|
||||
write!(response, "</body>\n</html>\n").unwrap();
|
||||
|
||||
html_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
|
||||
let response = Response::builder()
|
||||
.status(status)
|
||||
.header(hyper::header::CONTENT_TYPE, "text/html")
|
||||
.body(Body::from(data.as_bytes().to_vec()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
// Helper function to standardize the error messages we produce on bad durations
|
||||
//
|
||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||
@@ -850,7 +925,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -954,6 +1028,17 @@ async fn active_timeline_of_active_tenant(
|
||||
.map_err(ApiError::NotFound)
|
||||
}
|
||||
|
||||
async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
|
||||
// For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
|
||||
// Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
|
||||
let query = req.uri().query();
|
||||
let _ = std::panic::catch_unwind(|| {
|
||||
panic!("unconditional panic for testing panic hook integration; request query: {query:?}")
|
||||
});
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1019,7 +1104,7 @@ pub fn make_router(
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler)
|
||||
.put("/v1/tenant/config", update_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
@@ -1038,7 +1123,7 @@ pub fn make_router(
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
|
||||
testing_api!("run timeline GC", timeline_gc_handler),
|
||||
timeline_gc_handler,
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
|
||||
@@ -1072,5 +1157,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
evict_timeline_layer_handler,
|
||||
)
|
||||
.get("/v1/panic", always_panic_handler)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -2418,6 +2418,9 @@ impl Tenant {
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn gather_size_inputs(
|
||||
&self,
|
||||
// `max_retention_period` overrides the cutoff that is used to calculate the size
|
||||
// (only if it is shorter than the real cutoff).
|
||||
max_retention_period: Option<u64>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<size::ModelInputs> {
|
||||
let logical_sizes_at_once = self
|
||||
@@ -2425,32 +2428,41 @@ impl Tenant {
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.inner();
|
||||
|
||||
// TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
|
||||
// are for testing/experimenting, we tolerate this.
|
||||
// TODO: Having a single mutex block concurrent reads is not great for performance.
|
||||
//
|
||||
// But the only case where we need to run multiple of these at once is when we
|
||||
// request a size for a tenant manually via API, while another background calculation
|
||||
// is in progress (which is not a common case).
|
||||
//
|
||||
// See more for on the issue #2748 condenced out of the initial PR review.
|
||||
let mut shared_cache = self.cached_logical_sizes.lock().await;
|
||||
|
||||
size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
|
||||
size::gather_inputs(
|
||||
self,
|
||||
logical_sizes_at_once,
|
||||
max_retention_period,
|
||||
&mut shared_cache,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Calculate synthetic tenant size
|
||||
/// Calculate synthetic tenant size and cache the result.
|
||||
/// This is periodically called by background worker.
|
||||
/// result is cached in tenant struct
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
|
||||
let inputs = self.gather_size_inputs(ctx).await?;
|
||||
let inputs = self.gather_size_inputs(None, ctx).await?;
|
||||
|
||||
self.calc_and_update_cached_synthetic_size(&inputs)
|
||||
}
|
||||
|
||||
/// Calculate synthetic size , cache it and set metric value
|
||||
pub fn calc_and_update_cached_synthetic_size(
|
||||
&self,
|
||||
inputs: &size::ModelInputs,
|
||||
) -> anyhow::Result<u64> {
|
||||
let size = inputs.calculate()?;
|
||||
|
||||
self.set_cached_synthetic_size(size);
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
/// Cache given synthetic size and update the metric value
|
||||
pub fn set_cached_synthetic_size(&self, size: u64) {
|
||||
self.cached_synthetic_tenant_size
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
@@ -2458,8 +2470,6 @@ impl Tenant {
|
||||
.get_metric_with_label_values(&[&self.tenant_id.to_string()])
|
||||
.unwrap()
|
||||
.set(size);
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
pub fn get_cached_synthetic_size(&self) -> u64 {
|
||||
|
||||
@@ -154,6 +154,12 @@ where
|
||||
expected: &Arc<L>,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(
|
||||
// this is not what happens if an L0 layer was not found a anyhow error but perhaps
|
||||
// that should be changed. this is good enough to show a replacement failure.
|
||||
Replacement::NotFound
|
||||
));
|
||||
|
||||
self.layer_map.replace_historic_noflush(expected, new)
|
||||
}
|
||||
|
||||
|
||||
@@ -540,13 +540,11 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub async fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
@@ -571,14 +571,15 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch a delete operation in the background.
|
||||
///
|
||||
/// The operation does not modify local state but assumes the local files have already been
|
||||
/// deleted, and is used to mirror those changes to remote.
|
||||
///
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// succesfully.
|
||||
///
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -49,6 +49,17 @@ pub struct RemoteLayer {
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// Has `LayerMap::replace` failed for this (true) or not (false).
|
||||
///
|
||||
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
|
||||
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
|
||||
/// unprocessable, because a LayerMap::replace failed.
|
||||
///
|
||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||
/// `Timeline::download_remote_layer`, which also logs.
|
||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for RemoteLayer {
|
||||
@@ -207,6 +218,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
@@ -228,6 +240,7 @@ impl RemoteLayer {
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
@@ -11,6 +11,7 @@ use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::id::TenantId;
|
||||
|
||||
@@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(tenant) => tenant,
|
||||
},
|
||||
};
|
||||
};
|
||||
|
||||
let mut sleep_duration = tenant.get_compaction_period();
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
let period = tenant.get_compaction_period();
|
||||
|
||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
||||
// loop
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
let sleep_duration = if period == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
@@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
// cutoff specified as time.
|
||||
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
@@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
},
|
||||
};
|
||||
|
||||
let gc_period = tenant.get_gc_period();
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let mut sleep_duration = gc_period;
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run gc
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
let period = tenant.get_gc_period();
|
||||
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run gc
|
||||
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
|
||||
if let Err(e) = res {
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "gc");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
@@ -197,3 +229,49 @@ async fn wait_for_active_tenant(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("cancelled")]
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
/// Provide a random delay for background task initialization.
|
||||
///
|
||||
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
|
||||
/// different periods for more stable load.
|
||||
pub(crate) async fn random_init_delay(
|
||||
period: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), Cancelled> {
|
||||
use rand::Rng;
|
||||
|
||||
let d = {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// gen_range asserts that the range cannot be empty, which it could be because period can
|
||||
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
|
||||
let period = std::cmp::max(period, Duration::from_secs(10));
|
||||
|
||||
// semi-ok default as the source of jitter
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
_ = tokio::time::sleep(d) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
|
||||
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
|
||||
if elapsed >= period && period != Duration::ZERO {
|
||||
// humantime does no significant digits clamping whereas Duration's debug is a bit more
|
||||
// intelligent. however it makes sense to keep the "configuration format" for period, even
|
||||
// though there's no way to output the actual config value.
|
||||
warn!(
|
||||
?elapsed,
|
||||
period = %humantime::format_duration(period),
|
||||
task,
|
||||
"task iteration took longer than the configured period"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -613,7 +613,10 @@ impl Timeline {
|
||||
self.flush_frozen_layers_and_wait().await
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
const ROUNDS: usize = 2;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
|
||||
// Last record Lsn could be zero in case the timeline was just created
|
||||
@@ -622,6 +625,86 @@ impl Timeline {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// retry two times to allow first round to find layers which need to be downloaded, then
|
||||
// download them, then retry compaction
|
||||
for round in 0..ROUNDS {
|
||||
// should we error out with the most specific error?
|
||||
let last_round = round == ROUNDS - 1;
|
||||
|
||||
let res = self.compact_inner(ctx).await;
|
||||
|
||||
// If `create_image_layers' or `compact_level0` scheduled any
|
||||
// uploads or deletions, but didn't update the index file yet,
|
||||
// do it now.
|
||||
//
|
||||
// This isn't necessary for correctness, the remote state is
|
||||
// consistent without the uploads and deletions, and we would
|
||||
// update the index file on next flush iteration too. But it
|
||||
// could take a while until that happens.
|
||||
//
|
||||
// Additionally, only do this on the terminal round before sleeping.
|
||||
if last_round {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
}
|
||||
|
||||
let rls = match res {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(CompactionError::DownloadRequired(rls)) if !last_round => {
|
||||
// this can be done at most one time before exiting, waiting
|
||||
rls
|
||||
}
|
||||
Err(CompactionError::DownloadRequired(rls)) => {
|
||||
anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
|
||||
}
|
||||
Err(CompactionError::Other(e)) => {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
// this path can be visited in the second round of retrying, if first one found that we
|
||||
// must first download some remote layers
|
||||
let total = rls.len();
|
||||
|
||||
let mut downloads = rls
|
||||
.into_iter()
|
||||
.map(|rl| self.download_remote_layer(rl))
|
||||
.collect::<futures::stream::FuturesUnordered<_>>();
|
||||
|
||||
let mut failed = 0;
|
||||
|
||||
let cancelled = task_mgr::shutdown_watcher();
|
||||
tokio::pin!(cancelled);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
|
||||
res = downloads.next() => {
|
||||
match res {
|
||||
Some(Ok(())) => {},
|
||||
Some(Err(e)) => {
|
||||
warn!("Downloading remote layer for compaction failed: {e:#}");
|
||||
failed += 1;
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if failed != 0 {
|
||||
anyhow::bail!("{failed} out of {total} layers failed to download, retrying later");
|
||||
}
|
||||
|
||||
// if everything downloaded fine, lets try again
|
||||
}
|
||||
|
||||
unreachable!("retry loop exits")
|
||||
}
|
||||
|
||||
/// Compaction which might need to be retried after downloading remote layers.
|
||||
async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
|
||||
//
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
@@ -660,7 +743,7 @@ impl Timeline {
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Stopping {
|
||||
anyhow::bail!("timeline is Stopping");
|
||||
return Err(anyhow::anyhow!("timeline is Stopping").into());
|
||||
}
|
||||
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
@@ -680,7 +763,8 @@ impl Timeline {
|
||||
// "enough".
|
||||
let layer_paths_to_upload = self
|
||||
.create_image_layers(&partitioning, lsn, false, ctx)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
for (path, layer_metadata) in layer_paths_to_upload {
|
||||
remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
|
||||
@@ -692,18 +776,6 @@ impl Timeline {
|
||||
self.compact_level0(&layer_removal_cs, target_file_size, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
// If `create_image_layers' or `compact_level0` scheduled any
|
||||
// uploads or deletions, but didn't update the index file yet,
|
||||
// do it now.
|
||||
//
|
||||
// This isn't necessary for correctness, the remote state is
|
||||
// consistent without the uploads and deletions, and we would
|
||||
// update the index file on next flush iteration too. But it
|
||||
// could take a while until that happens.
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -2536,10 +2608,13 @@ impl Timeline {
|
||||
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||
{
|
||||
let partitioning_guard = self.partitioning.lock().unwrap();
|
||||
if partitioning_guard.1 != Lsn(0)
|
||||
&& lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
|
||||
{
|
||||
// no repartitioning needed
|
||||
let distance = lsn.0 - partitioning_guard.1 .0;
|
||||
if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
|
||||
debug!(
|
||||
distance,
|
||||
threshold = self.repartition_threshold,
|
||||
"no repartitioning needed"
|
||||
);
|
||||
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
|
||||
}
|
||||
}
|
||||
@@ -2557,8 +2632,12 @@ impl Timeline {
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
let mut max_deltas = 0;
|
||||
|
||||
for part_range in &partition.ranges {
|
||||
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
||||
for (img_range, last_img) in image_coverage {
|
||||
@@ -2580,21 +2659,25 @@ impl Timeline {
|
||||
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
||||
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||
if img_lsn < lsn {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
let num_deltas =
|
||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
|
||||
|
||||
debug!(
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
max_deltas = max_deltas.max(num_deltas);
|
||||
if num_deltas >= threshold {
|
||||
debug!(
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!(
|
||||
max_deltas,
|
||||
"none of the partitioned ranges had >= {threshold} deltas"
|
||||
);
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
@@ -2707,25 +2790,55 @@ impl Timeline {
|
||||
Ok(layer_paths_to_upload)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct CompactLevel0Phase1Result {
|
||||
new_layers: Vec<DeltaLayer>,
|
||||
deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
|
||||
}
|
||||
|
||||
/// Top-level failure to compact.
|
||||
#[derive(Debug)]
|
||||
enum CompactionError {
|
||||
/// L0 compaction requires layers to be downloaded.
|
||||
///
|
||||
/// This should not happen repeatedly, but will be retried once by top-level
|
||||
/// `Timeline::compact`.
|
||||
DownloadRequired(Vec<Arc<RemoteLayer>>),
|
||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for CompactionError {
|
||||
fn from(value: anyhow::Error) -> Self {
|
||||
CompactionError::Other(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
|
||||
///
|
||||
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||
async fn compact_level0_phase1(
|
||||
&self,
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<CompactLevel0Phase1Result> {
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
let layers = self.layers.read().unwrap();
|
||||
let mut level0_deltas = layers.get_level0_deltas()?;
|
||||
drop(layers);
|
||||
|
||||
// Only compact if enough layers have accumulated.
|
||||
if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
|
||||
return Ok(Default::default());
|
||||
let threshold = self.get_compaction_threshold();
|
||||
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
|
||||
debug!(
|
||||
level0_deltas = level0_deltas.len(),
|
||||
threshold, "too few deltas to compact"
|
||||
);
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
}
|
||||
|
||||
// Gather the files to compact in this iteration.
|
||||
@@ -2761,6 +2874,24 @@ impl Timeline {
|
||||
end: deltas_to_compact.last().unwrap().get_lsn_range().end,
|
||||
};
|
||||
|
||||
let remotes = deltas_to_compact
|
||||
.iter()
|
||||
.filter(|l| l.is_remote_layer())
|
||||
.inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
|
||||
.map(|l| {
|
||||
l.clone()
|
||||
.downcast_remote_layer()
|
||||
.expect("just checked it is remote layer")
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if !remotes.is_empty() {
|
||||
// caller is holding the lock to layer_removal_cs, and we don't want to download while
|
||||
// holding that; in future download_remote_layer might take it as well. this is
|
||||
// regardless of earlier image creation downloading on-demand, while holding the lock.
|
||||
return Err(CompactionError::DownloadRequired(remotes));
|
||||
}
|
||||
|
||||
info!(
|
||||
"Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
|
||||
lsn_range.start,
|
||||
@@ -2768,9 +2899,11 @@ impl Timeline {
|
||||
deltas_to_compact.len(),
|
||||
level0_deltas.len()
|
||||
);
|
||||
|
||||
for l in deltas_to_compact.iter() {
|
||||
info!("compact includes {}", l.filename().file_name());
|
||||
}
|
||||
|
||||
// We don't need the original list of layers anymore. Drop it so that
|
||||
// we don't accidentally use it later in the function.
|
||||
drop(level0_deltas);
|
||||
@@ -2940,7 +3073,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
|
||||
return Err(
|
||||
anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(),
|
||||
);
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
@@ -2959,7 +3094,7 @@ impl Timeline {
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
// minimize latency.
|
||||
par_fsync::par_fsync(&layer_paths)?;
|
||||
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
||||
|
||||
layer_paths.pop().unwrap();
|
||||
}
|
||||
@@ -2981,11 +3116,13 @@ impl Timeline {
|
||||
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), CompactionError> {
|
||||
let CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact,
|
||||
} = self.compact_level0_phase1(target_file_size, ctx).await?;
|
||||
} = self
|
||||
.compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
|
||||
.await?;
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
// nothing to do
|
||||
@@ -3009,7 +3146,12 @@ impl Timeline {
|
||||
for l in new_layers {
|
||||
let new_delta_path = l.path();
|
||||
|
||||
let metadata = new_delta_path.metadata()?;
|
||||
let metadata = new_delta_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"read file metadata for new created layer {}",
|
||||
new_delta_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_upload(
|
||||
@@ -3243,7 +3385,7 @@ impl Timeline {
|
||||
|
||||
let mut layers_to_remove = Vec::new();
|
||||
|
||||
// Scan all on-disk layers in the timeline.
|
||||
// Scan all layers in the timeline (remote or on-disk).
|
||||
//
|
||||
// Garbage collect the layer if all conditions are satisfied:
|
||||
// 1. it is older than cutoff LSN;
|
||||
@@ -3482,14 +3624,26 @@ impl Timeline {
|
||||
&self,
|
||||
remote_layer: Arc<RemoteLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::sync::atomic::Ordering::Relaxed;
|
||||
|
||||
let permit = match Arc::clone(&remote_layer.ongoing_download)
|
||||
.acquire_owned()
|
||||
.await
|
||||
{
|
||||
Ok(permit) => permit,
|
||||
Err(_closed) => {
|
||||
info!("download of layer has already finished");
|
||||
return Ok(());
|
||||
if remote_layer.download_replacement_failure.load(Relaxed) {
|
||||
// this path will be hit often, in case there are upper retries. however
|
||||
// hitting this error will prevent a busy loop between get_reconstruct_data and
|
||||
// download, so an error is prefered.
|
||||
//
|
||||
// TODO: we really should poison the timeline, but panicking is not yet
|
||||
// supported. Related: https://github.com/neondatabase/neon/issues/3621
|
||||
anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
|
||||
} else {
|
||||
info!("download of layer has already finished");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3525,8 +3679,8 @@ impl Timeline {
|
||||
{
|
||||
use crate::tenant::layer_map::Replacement;
|
||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||
match updates.replace_historic(&l, new_layer) {
|
||||
Ok(Replacement::Replaced { .. }) => { /* expected */ }
|
||||
let failure = match updates.replace_historic(&l, new_layer) {
|
||||
Ok(Replacement::Replaced { .. }) => false,
|
||||
Ok(Replacement::NotFound) => {
|
||||
// TODO: the downloaded file should probably be removed, otherwise
|
||||
// it will be added to the layermap on next load? we should
|
||||
@@ -3534,6 +3688,7 @@ impl Timeline {
|
||||
//
|
||||
// See: https://github.com/neondatabase/neon/issues/3533
|
||||
error!("replacing downloaded layer into layermap failed because layer was not found");
|
||||
true
|
||||
}
|
||||
Ok(Replacement::RemovalBuffered) => {
|
||||
unreachable!("current implementation does not remove anything")
|
||||
@@ -3552,12 +3707,32 @@ impl Timeline {
|
||||
?other,
|
||||
"replacing downloaded layer into layermap failed because another layer was found instead of expected"
|
||||
);
|
||||
true
|
||||
}
|
||||
Err(e) => {
|
||||
// this is a precondition failure, the layer filename derived
|
||||
// attributes didn't match up, which doesn't seem likely.
|
||||
error!("replacing downloaded layer into layermap failed: {e:#?}")
|
||||
error!("replacing downloaded layer into layermap failed: {e:#?}");
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
if failure {
|
||||
// mark the remote layer permanently failed; the timeline is most
|
||||
// likely unusable after this. sadly we cannot just poison the layermap
|
||||
// lock with panic, because that would create an issue with shutdown.
|
||||
//
|
||||
// this does not change the retry semantics on failed downloads.
|
||||
//
|
||||
// use of Relaxed is valid because closing of the semaphore gives
|
||||
// happens-before and wakes up any waiters; we write this value before
|
||||
// and any waiters (or would be waiters) will load it after closing
|
||||
// semaphore.
|
||||
//
|
||||
// See: https://github.com/neondatabase/neon/issues/3533
|
||||
remote_layer
|
||||
.download_replacement_failure
|
||||
.store(true, Relaxed);
|
||||
}
|
||||
}
|
||||
updates.flush();
|
||||
@@ -3583,7 +3758,7 @@ impl Timeline {
|
||||
drop(permit);
|
||||
|
||||
Ok(())
|
||||
},
|
||||
}.in_current_span(),
|
||||
);
|
||||
|
||||
receiver.await.context("download task cancelled")?
|
||||
|
||||
@@ -41,9 +41,23 @@ impl Timeline {
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
let policy = self.get_eviction_policy();
|
||||
let period = match policy {
|
||||
EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
|
||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
||||
};
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
info!("shutting down");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self.eviction_iteration(&policy, cancel.clone()).await;
|
||||
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
ControlFlow::Continue(sleep_until) => {
|
||||
@@ -78,13 +92,7 @@ impl Timeline {
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
let elapsed = start.elapsed();
|
||||
if elapsed > p.period {
|
||||
warn!(
|
||||
configured_period = %humantime::format_duration(p.period),
|
||||
last_period = %humantime::format_duration(elapsed),
|
||||
"this eviction period took longer than the configured period"
|
||||
);
|
||||
}
|
||||
crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
|
||||
ControlFlow::Continue(start + p.period)
|
||||
}
|
||||
}
|
||||
@@ -100,7 +108,6 @@ impl Timeline {
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
struct EvictionStats {
|
||||
not_considered_due_to_clock_skew: usize,
|
||||
candidates: usize,
|
||||
evicted: usize,
|
||||
errors: usize,
|
||||
@@ -129,9 +136,21 @@ impl Timeline {
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
Err(_e) => {
|
||||
// NB: don't log the error. If there are many layers and the system clock
|
||||
// is skewed, we'd be flooding the log.
|
||||
stats.not_considered_due_to_clock_skew += 1;
|
||||
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
||||
// happen if there is an access between us getting `now`, and us getting
|
||||
// the access stats from the layer.
|
||||
//
|
||||
// The other reason why it can happen is system clock skew because
|
||||
// SystemTime::now() is not monotonic, so, even if there is no access
|
||||
// to the layer after we get `now` at the beginning of this function,
|
||||
// it could be that `now` < `last_activity_ts`.
|
||||
//
|
||||
// To distinguish the cases, we would need to record `Instant`s in the
|
||||
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
||||
// values in the access stats would need to be `Instant`'s, and hence
|
||||
// they would be meaningless outside of the pageserver process.
|
||||
// At the time of writing, the trade-off is that access stats are more
|
||||
// valuable than detecting clock skew.
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@@ -188,8 +207,9 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
if stats.not_considered_due_to_clock_skew > 0 || stats.errors > 0 || stats.not_evictable > 0
|
||||
{
|
||||
if stats.candidates == stats.not_evictable {
|
||||
debug!(stats=?stats, "eviction iteration complete");
|
||||
} else if stats.errors > 0 || stats.not_evictable > 0 {
|
||||
warn!(stats=?stats, "eviction iteration complete");
|
||||
} else {
|
||||
info!(stats=?stats, "eviction iteration complete");
|
||||
|
||||
@@ -27,8 +27,7 @@ use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::fd::RawFd;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::os::unix::prelude::CommandExt;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
|
||||
78
poetry.lock
generated
78
poetry.lock
generated
@@ -1238,7 +1238,6 @@ category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"},
|
||||
{file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"},
|
||||
]
|
||||
|
||||
@@ -1309,66 +1308,63 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "moto"
|
||||
version = "3.1.18"
|
||||
description = "A library that allows your python tests to easily mock out the boto library"
|
||||
version = "4.1.2"
|
||||
description = ""
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"},
|
||||
{file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"},
|
||||
{file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"},
|
||||
{file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""}
|
||||
boto3 = ">=1.9.201"
|
||||
botocore = ">=1.12.201"
|
||||
cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""}
|
||||
cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""}
|
||||
cryptography = ">=3.3.1"
|
||||
docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""}
|
||||
ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""}
|
||||
flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""}
|
||||
flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""}
|
||||
flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
|
||||
graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
|
||||
idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""}
|
||||
Jinja2 = ">=2.10.1"
|
||||
jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
|
||||
MarkupSafe = "!=2.0.0a1"
|
||||
openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""}
|
||||
pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""}
|
||||
python-dateutil = ">=2.1,<3.0.0"
|
||||
python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
|
||||
pytz = "*"
|
||||
PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
|
||||
requests = ">=2.5"
|
||||
responses = ">=0.9.0"
|
||||
responses = ">=0.13.0"
|
||||
setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
|
||||
sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
|
||||
werkzeug = ">=0.5,<2.2.0"
|
||||
werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1"
|
||||
xmltodict = "*"
|
||||
|
||||
[package.extras]
|
||||
all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
|
||||
apigatewayv2 = ["PyYAML (>=5.1)"]
|
||||
appsync = ["graphql-core"]
|
||||
awslambda = ["docker (>=2.5.1)"]
|
||||
batch = ["docker (>=2.5.1)"]
|
||||
cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
|
||||
ds = ["sshpubkeys (>=3.1.0)"]
|
||||
dynamodb = ["docker (>=2.5.1)"]
|
||||
dynamodb2 = ["docker (>=2.5.1)"]
|
||||
dynamodbstreams = ["docker (>=2.5.1)"]
|
||||
ebs = ["sshpubkeys (>=3.1.0)"]
|
||||
ec2 = ["sshpubkeys (>=3.1.0)"]
|
||||
efs = ["sshpubkeys (>=3.1.0)"]
|
||||
eks = ["sshpubkeys (>=3.1.0)"]
|
||||
glue = ["pyparsing (>=3.0.7)"]
|
||||
iotdata = ["jsondiff (>=1.1.2)"]
|
||||
route53resolver = ["sshpubkeys (>=3.1.0)"]
|
||||
s3 = ["PyYAML (>=5.1)"]
|
||||
server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
ssm = ["PyYAML (>=5.1)", "dataclasses"]
|
||||
server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
|
||||
ssm = ["PyYAML (>=5.1)"]
|
||||
xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
|
||||
|
||||
[[package]]
|
||||
@@ -1716,7 +1712,6 @@ python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
|
||||
@@ -1750,7 +1745,6 @@ files = [
|
||||
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
|
||||
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
|
||||
@@ -1762,7 +1756,6 @@ files = [
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
|
||||
@@ -1795,7 +1788,18 @@ category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
|
||||
{file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
|
||||
{file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
|
||||
{file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
|
||||
{file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
|
||||
{file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
|
||||
{file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
|
||||
{file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
|
||||
{file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
|
||||
{file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
|
||||
{file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
|
||||
{file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
|
||||
{file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
|
||||
]
|
||||
|
||||
@@ -2004,8 +2008,8 @@ files = [
|
||||
|
||||
[package.dependencies]
|
||||
pytest = [
|
||||
{version = ">=5.0", markers = "python_version < \"3.10\""},
|
||||
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
|
||||
{version = ">=5.0", markers = "python_version < \"3.10\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2082,18 +2086,6 @@ cryptography = ["cryptography (>=3.4.0)"]
|
||||
pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"]
|
||||
pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2022.1"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
|
||||
{file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pywin32"
|
||||
version = "301"
|
||||
@@ -2129,13 +2121,6 @@ files = [
|
||||
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
|
||||
{file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
|
||||
{file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
|
||||
@@ -2449,16 +2434,19 @@ test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "2.1.2"
|
||||
version = "2.2.3"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"},
|
||||
{file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"},
|
||||
{file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
|
||||
{file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
MarkupSafe = ">=2.1.1"
|
||||
|
||||
[package.extras]
|
||||
watchdog = ["watchdog"]
|
||||
|
||||
@@ -2655,4 +2643,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3"
|
||||
content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd"
|
||||
|
||||
@@ -28,6 +28,7 @@ itertools.workspace = true
|
||||
md5.workspace = true
|
||||
metrics.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
pq_proto.workspace = true
|
||||
@@ -35,6 +36,8 @@ prometheus.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
reqwest-middleware.workspace = true
|
||||
reqwest-tracing.workspace = true
|
||||
routerify.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
rustls.workspace = true
|
||||
@@ -48,8 +51,10 @@ thiserror.workspace = true
|
||||
tls-listener.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio = { workspace = true, features = ["signal"] }
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
mod classic;
|
||||
|
||||
mod hacks;
|
||||
mod link;
|
||||
use futures::TryFutureExt;
|
||||
|
||||
pub use link::LinkAuthError;
|
||||
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
auth::{self, ClientCredentials},
|
||||
console::{
|
||||
self,
|
||||
provider::{CachedNodeInfo, ConsoleReqExtra},
|
||||
@@ -13,9 +13,10 @@ use crate::{
|
||||
},
|
||||
stream, url,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use std::borrow::Cow;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
use tracing::info;
|
||||
|
||||
/// A product of successful authentication.
|
||||
pub struct AuthSuccess<T> {
|
||||
@@ -105,97 +106,49 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc).
|
||||
// Read more: https://github.com/rust-lang/rust/issues/99190
|
||||
// Alleged fix: https://github.com/rust-lang/rust/pull/89056
|
||||
impl<'l> BackendType<'l, ClientCredentials<'_>> {
|
||||
/// Do something special if user didn't provide the `project` parameter.
|
||||
async fn try_password_hack<'a>(
|
||||
&'a mut self,
|
||||
extra: &'a ConsoleReqExtra<'a>,
|
||||
client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<Option<AuthSuccess<CachedNodeInfo>>> {
|
||||
use BackendType::*;
|
||||
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the project name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
let fetch_magic_payload = |client| async {
|
||||
warn!("project name not specified, resorting to the password hack auth flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
info!(project = &payload.project, "received missing parameter");
|
||||
auth::Result::Ok(payload)
|
||||
};
|
||||
|
||||
// If we want to use cleartext password flow, we can read the password
|
||||
// from the client and pretend that it's a magic payload (PasswordHack hack).
|
||||
let fetch_plaintext_password = |client| async {
|
||||
info!("using cleartext password flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
auth::Result::Ok(auth::password_hack::PasswordHackPayload {
|
||||
project: String::new(),
|
||||
password: payload,
|
||||
})
|
||||
};
|
||||
|
||||
// TODO: find a proper way to merge those very similar blocks.
|
||||
let (mut node, password) = match self {
|
||||
Console(api, creds) if creds.project.is_none() => {
|
||||
let payload = fetch_magic_payload(client).await?;
|
||||
creds.project = Some(payload.project.into());
|
||||
let node = api.wake_compute(extra, creds).await?;
|
||||
|
||||
(node, payload.password)
|
||||
}
|
||||
// This is a hack to allow cleartext password in secure connections (wss).
|
||||
Console(api, creds) if creds.use_cleartext_password_flow => {
|
||||
let payload = fetch_plaintext_password(client).await?;
|
||||
let node = api.wake_compute(extra, creds).await?;
|
||||
|
||||
(node, payload.password)
|
||||
}
|
||||
Postgres(api, creds) if creds.project.is_none() => {
|
||||
let payload = fetch_magic_payload(client).await?;
|
||||
creds.project = Some(payload.project.into());
|
||||
let node = api.wake_compute(extra, creds).await?;
|
||||
|
||||
(node, payload.password)
|
||||
}
|
||||
_ => return Ok(None),
|
||||
};
|
||||
|
||||
node.config.password(password);
|
||||
Ok(Some(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
}))
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
async fn auth_quirks(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the endpoint (project) name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
if creds.project.is_none() {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::password_hack(api, extra, creds, client).await;
|
||||
}
|
||||
|
||||
// Password hack should set the project name.
|
||||
// TODO: make `creds.project` more type-safe.
|
||||
assert!(creds.project.is_some());
|
||||
|
||||
// Perform cleartext auth if we're allowed to do that.
|
||||
// Currently, we use it for websocket connections (latency).
|
||||
if allow_cleartext {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::cleartext_hack(api, extra, creds, client).await;
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
classic::authenticate(api, extra, creds, client).await
|
||||
}
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
pub async fn authenticate<'a>(
|
||||
#[tracing::instrument(fields(allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
&mut self,
|
||||
extra: &'a ConsoleReqExtra<'a>,
|
||||
client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
use BackendType::*;
|
||||
|
||||
// Handle cases when `project` is missing in `creds`.
|
||||
// TODO: type safety: return `creds` with irrefutable `project`.
|
||||
if let Some(res) = self.try_password_hack(extra, client).await? {
|
||||
info!("user successfully authenticated (using the password hack)");
|
||||
return Ok(res);
|
||||
}
|
||||
|
||||
let res = match self {
|
||||
Console(api, creds) => {
|
||||
info!(
|
||||
@@ -204,20 +157,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
|
||||
"performing authentication using the console"
|
||||
);
|
||||
|
||||
assert!(creds.project.is_some());
|
||||
classic::handle_user(api.as_ref(), extra, creds, client).await?
|
||||
let api = api.as_ref();
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext).await?
|
||||
}
|
||||
Postgres(api, creds) => {
|
||||
info!("performing mock authentication using a local postgres instance");
|
||||
info!(
|
||||
user = creds.user,
|
||||
project = creds.project(),
|
||||
"performing authentication using a local postgres instance"
|
||||
);
|
||||
|
||||
assert!(creds.project.is_some());
|
||||
classic::handle_user(api.as_ref(), extra, creds, client).await?
|
||||
let api = api.as_ref();
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext).await?
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
info!("performing link authentication");
|
||||
|
||||
link::handle_user(url, client)
|
||||
link::authenticate(url, client)
|
||||
.await?
|
||||
.map(CachedNodeInfo::new_uncached)
|
||||
}
|
||||
@@ -229,9 +186,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
|
||||
|
||||
/// When applicable, wake the compute node, gaining its connection info in the process.
|
||||
/// The link auth flow doesn't support this, so we return [`None`] in that case.
|
||||
pub async fn wake_compute<'a>(
|
||||
pub async fn wake_compute(
|
||||
&self,
|
||||
extra: &'a ConsoleReqExtra<'a>,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ use crate::{
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
pub(super) async fn handle_user(
|
||||
pub(super) async fn authenticate(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
|
||||
66
proxy/src/auth/backend/hacks.rs
Normal file
66
proxy/src/auth/backend/hacks.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use super::AuthSuccess;
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
console::{
|
||||
self,
|
||||
provider::{CachedNodeInfo, ConsoleReqExtra},
|
||||
},
|
||||
stream,
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Compared to [SCRAM](crate::scram), cleartext password auth saves
|
||||
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
|
||||
/// These properties are benefical for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn cleartext_hack(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
let password = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(password);
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
})
|
||||
}
|
||||
|
||||
/// Workaround for clients which don't provide an endpoint (project) name.
|
||||
/// Very similar to [`cleartext_hack`], but there's a specific password format.
|
||||
pub async fn password_hack(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
warn!("project not specified, resorting to the password hack auth flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
info!(project = &payload.project, "received missing parameter");
|
||||
creds.project = Some(payload.project.into());
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(payload.password);
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
})
|
||||
}
|
||||
@@ -53,7 +53,7 @@ pub fn new_psql_session_id() -> String {
|
||||
hex::encode(rand::random::<[u8; 8]>())
|
||||
}
|
||||
|
||||
pub(super) async fn handle_user(
|
||||
pub(super) async fn authenticate(
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<NodeInfo>> {
|
||||
|
||||
@@ -11,12 +11,16 @@ pub enum ClientCredsParseError {
|
||||
#[error("Parameter '{0}' is missing in startup packet.")]
|
||||
MissingKey(&'static str),
|
||||
|
||||
#[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)]
|
||||
#[error(
|
||||
"Inconsistent project name inferred from \
|
||||
SNI ('{}') and project option ('{}').",
|
||||
.domain, .option,
|
||||
)]
|
||||
InconsistentProjectNames { domain: String, option: String },
|
||||
|
||||
#[error(
|
||||
"SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
|
||||
SNI should be formatted as '<project-name>.{}'.",
|
||||
SNI should be formatted as '<project-name>.{}'.",
|
||||
.sni, .cn, .cn,
|
||||
)]
|
||||
InconsistentSni { sni: String, cn: String },
|
||||
@@ -34,9 +38,6 @@ pub struct ClientCredentials<'a> {
|
||||
pub user: &'a str,
|
||||
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
||||
pub project: Option<Cow<'a, str>>,
|
||||
/// If `True`, we'll use the old cleartext password flow. This is used for
|
||||
/// websocket connections, which want to minimize the number of round trips.
|
||||
pub use_cleartext_password_flow: bool,
|
||||
}
|
||||
|
||||
impl ClientCredentials<'_> {
|
||||
@@ -51,7 +52,6 @@ impl<'a> ClientCredentials<'a> {
|
||||
params: &'a StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_name: Option<&str>,
|
||||
use_cleartext_password_flow: bool,
|
||||
) -> Result<Self, ClientCredsParseError> {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
@@ -96,18 +96,9 @@ impl<'a> ClientCredentials<'a> {
|
||||
}
|
||||
.transpose()?;
|
||||
|
||||
info!(
|
||||
user = user,
|
||||
project = project.as_deref(),
|
||||
use_cleartext_password_flow = use_cleartext_password_flow,
|
||||
"credentials"
|
||||
);
|
||||
info!(user, project = project.as_deref(), "credentials");
|
||||
|
||||
Ok(Self {
|
||||
user,
|
||||
project,
|
||||
use_cleartext_password_flow,
|
||||
})
|
||||
Ok(Self { user, project })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,7 +122,7 @@ mod tests {
|
||||
// According to postgresql, only `user` should be required.
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project, None);
|
||||
|
||||
@@ -146,7 +137,7 @@ mod tests {
|
||||
("foo", "bar"), // should be ignored
|
||||
]);
|
||||
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project, None);
|
||||
|
||||
@@ -160,7 +151,7 @@ mod tests {
|
||||
let sni = Some("foo.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
|
||||
@@ -174,7 +165,7 @@ mod tests {
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
|
||||
@@ -188,7 +179,7 @@ mod tests {
|
||||
let sni = Some("baz.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
|
||||
@@ -203,8 +194,7 @@ mod tests {
|
||||
let sni = Some("second.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let err =
|
||||
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentProjectNames { domain, option } => {
|
||||
assert_eq!(option, "first");
|
||||
@@ -221,8 +211,7 @@ mod tests {
|
||||
let sni = Some("project.localhost");
|
||||
let common_name = Some("example.com");
|
||||
|
||||
let err =
|
||||
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentSni { sni, cn } => {
|
||||
assert_eq!(sni, "project.localhost");
|
||||
|
||||
@@ -77,14 +77,9 @@ impl ConnCfg {
|
||||
self.dbname(dbname);
|
||||
}
|
||||
|
||||
if let Some(options) = params.options_raw() {
|
||||
// We must drop all proxy-specific parameters.
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = options
|
||||
.filter(|opt| !opt.starts_with("project="))
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(params) {
|
||||
self.options(&options);
|
||||
}
|
||||
|
||||
@@ -225,3 +220,46 @@ impl ConnCfg {
|
||||
Ok(connection)
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve `options` from a startup message, dropping all proxy-secific flags.
|
||||
fn filtered_options(params: &StartupMessageParams) -> Option<String> {
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = params
|
||||
.options_raw()?
|
||||
.filter(|opt| !opt.starts_with("project="))
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
|
||||
// Don't even bother with empty options.
|
||||
if options.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(options)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_filtered_options() {
|
||||
// Empty options is unlikely to be useful anyway.
|
||||
let params = StartupMessageParams::new([("options", "")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
|
||||
// It's likely that clients will only use options to specify endpoint/project.
|
||||
let params = StartupMessageParams::new([("options", "project=foo")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
|
||||
// Same, because unescaped whitespaces are no-op.
|
||||
let params = StartupMessageParams::new([("options", " project=foo ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), None);
|
||||
|
||||
let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ "));
|
||||
|
||||
let params = StartupMessageParams::new([("options", "project = foo")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ pub struct ProxyConfig {
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MetricCollectionConfig {
|
||||
pub endpoint: reqwest::Url,
|
||||
pub interval: Duration,
|
||||
|
||||
@@ -5,10 +5,7 @@ use crate::{
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
|
||||
use std::{
|
||||
net::{TcpListener, TcpStream},
|
||||
thread,
|
||||
};
|
||||
use std::{net::TcpStream, thread};
|
||||
use tracing::{error, info, info_span};
|
||||
use utils::{
|
||||
postgres_backend::{self, AuthType, PostgresBackend},
|
||||
@@ -34,23 +31,24 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N
|
||||
CPLANE_WAITERS.notify(psql_session_id, msg)
|
||||
}
|
||||
|
||||
/// Console management API listener thread.
|
||||
/// Console management API listener task.
|
||||
/// It spawns console response handlers needed for the link auth.
|
||||
pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
|
||||
pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("mgmt has shut down");
|
||||
}
|
||||
|
||||
listener
|
||||
.set_nonblocking(false)
|
||||
.context("failed to set listener to blocking")?;
|
||||
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
|
||||
let (socket, peer_addr) = listener.accept().await?;
|
||||
info!("accepted connection from {peer_addr}");
|
||||
|
||||
let socket = socket.into_std()?;
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set client socket option")?;
|
||||
socket
|
||||
.set_nonblocking(false)
|
||||
.context("failed to set client socket option")?;
|
||||
|
||||
// TODO: replace with async tasks.
|
||||
thread::spawn(move || {
|
||||
|
||||
@@ -11,8 +11,10 @@ use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub mod errors {
|
||||
use crate::error::{io_error, UserFacingError};
|
||||
use reqwest::StatusCode as HttpStatusCode;
|
||||
use crate::{
|
||||
error::{io_error, UserFacingError},
|
||||
http,
|
||||
};
|
||||
use thiserror::Error;
|
||||
|
||||
/// A go-to error message which doesn't leak any detail.
|
||||
@@ -24,7 +26,7 @@ pub mod errors {
|
||||
/// Error returned by the console itself.
|
||||
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
|
||||
Console {
|
||||
status: HttpStatusCode,
|
||||
status: http::StatusCode,
|
||||
text: Box<str>,
|
||||
},
|
||||
|
||||
@@ -35,7 +37,7 @@ pub mod errors {
|
||||
|
||||
impl ApiError {
|
||||
/// Returns HTTP status code if it's the reason for failure.
|
||||
pub fn http_status_code(&self) -> Option<HttpStatusCode> {
|
||||
pub fn http_status_code(&self) -> Option<http::StatusCode> {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
Console { status, .. } => Some(*status),
|
||||
@@ -51,15 +53,15 @@ pub mod errors {
|
||||
// To minimize risks, only select errors are forwarded to users.
|
||||
// Ask @neondatabase/control-plane for review before adding more.
|
||||
Console { status, .. } => match *status {
|
||||
HttpStatusCode::NOT_FOUND => {
|
||||
http::StatusCode::NOT_FOUND => {
|
||||
// Status 404: failed to get a project-related resource.
|
||||
format!("{REQUEST_FAILED}: endpoint cannot be found")
|
||||
}
|
||||
HttpStatusCode::NOT_ACCEPTABLE => {
|
||||
http::StatusCode::NOT_ACCEPTABLE => {
|
||||
// Status 406: endpoint is disabled (we don't allow connections).
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
HttpStatusCode::LOCKED => {
|
||||
http::StatusCode::LOCKED => {
|
||||
// Status 423: project might be in maintenance mode (or bad state).
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable")
|
||||
}
|
||||
@@ -70,13 +72,18 @@ pub mod errors {
|
||||
}
|
||||
}
|
||||
|
||||
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
|
||||
impl From<reqwest::Error> for ApiError {
|
||||
fn from(e: reqwest::Error) -> Self {
|
||||
io_error(e).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<reqwest_middleware::Error> for ApiError {
|
||||
fn from(e: reqwest_middleware::Error) -> Self {
|
||||
io_error(e).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum GetAuthInfoError {
|
||||
// We shouldn't include the actual secret here.
|
||||
|
||||
@@ -8,7 +8,6 @@ use super::{
|
||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use reqwest::StatusCode as HttpStatusCode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -52,7 +51,7 @@ impl Api {
|
||||
Ok(body) => body,
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
Err(e) => match e.http_status_code() {
|
||||
Some(HttpStatusCode::NOT_FOUND) => return Ok(None),
|
||||
Some(http::StatusCode::NOT_FOUND) => return Ok(None),
|
||||
_otherwise => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
@@ -154,7 +153,7 @@ impl super::Api for Api {
|
||||
|
||||
/// Parse http response body, taking status code into account.
|
||||
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
response: reqwest::Response,
|
||||
response: http::Response,
|
||||
) -> Result<T, ApiError> {
|
||||
let status = response.status();
|
||||
if status.is_success() {
|
||||
|
||||
@@ -1,7 +1,24 @@
|
||||
//! HTTP client and server impls.
|
||||
//! Other modules should use stuff from this module instead of
|
||||
//! directly relying on deps like `reqwest` (think loose coupling).
|
||||
|
||||
pub mod server;
|
||||
pub mod websocket;
|
||||
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
|
||||
use crate::url::ApiUrl;
|
||||
use reqwest_middleware::RequestBuilder;
|
||||
|
||||
/// This is the preferred way to create new http clients,
|
||||
/// because it takes care of observability (OpenTelemetry).
|
||||
/// We deliberately don't want to replace this with a public static.
|
||||
pub fn new_client() -> ClientWithMiddleware {
|
||||
reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
|
||||
.with(reqwest_tracing::TracingMiddleware::default())
|
||||
.build()
|
||||
}
|
||||
|
||||
/// Thin convenience wrapper for an API provided by an http endpoint.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -9,13 +26,17 @@ pub struct Endpoint {
|
||||
/// API's base URL.
|
||||
endpoint: ApiUrl,
|
||||
/// Connection manager with built-in pooling.
|
||||
client: reqwest::Client,
|
||||
client: ClientWithMiddleware,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
/// Construct a new HTTP endpoint wrapper.
|
||||
pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self {
|
||||
Self { endpoint, client }
|
||||
/// Http client is not constructed under the hood so that it can be shared.
|
||||
pub fn new(endpoint: ApiUrl, client: impl Into<ClientWithMiddleware>) -> Self {
|
||||
Self {
|
||||
endpoint,
|
||||
client: client.into(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -23,19 +44,16 @@ impl Endpoint {
|
||||
&self.endpoint
|
||||
}
|
||||
|
||||
/// Return a [builder](reqwest::RequestBuilder) for a `GET` request,
|
||||
/// Return a [builder](RequestBuilder) for a `GET` request,
|
||||
/// appending a single `path` segment to the base endpoint URL.
|
||||
pub fn get(&self, path: &str) -> reqwest::RequestBuilder {
|
||||
pub fn get(&self, path: &str) -> RequestBuilder {
|
||||
let mut url = self.endpoint.clone();
|
||||
url.path_segments_mut().push(path);
|
||||
self.client.get(url.into_inner())
|
||||
}
|
||||
|
||||
/// Execute a [request](reqwest::Request).
|
||||
pub async fn execute(
|
||||
&self,
|
||||
request: reqwest::Request,
|
||||
) -> Result<reqwest::Response, reqwest::Error> {
|
||||
pub async fn execute(&self, request: Request) -> Result<Response, Error> {
|
||||
self.client.execute(request).await
|
||||
}
|
||||
}
|
||||
@@ -43,11 +61,12 @@ impl Endpoint {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use reqwest::Client;
|
||||
|
||||
#[test]
|
||||
fn optional_query_params() -> anyhow::Result<()> {
|
||||
let url = "http://example.com".parse()?;
|
||||
let endpoint = Endpoint::new(url, reqwest::Client::new());
|
||||
let endpoint = Endpoint::new(url, Client::new());
|
||||
|
||||
// Validate that this pattern makes sense.
|
||||
let req = endpoint
|
||||
@@ -66,7 +85,7 @@ mod tests {
|
||||
#[test]
|
||||
fn uuid_params() -> anyhow::Result<()> {
|
||||
let url = "http://example.com".parse()?;
|
||||
let endpoint = Endpoint::new(url, reqwest::Client::new());
|
||||
let endpoint = Endpoint::new(url, Client::new());
|
||||
|
||||
let req = endpoint
|
||||
.get("frobnicate")
|
||||
|
||||
@@ -186,8 +186,8 @@ async fn ws_handler(
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
ws_listener: TcpListener,
|
||||
config: &'static ProxyConfig,
|
||||
ws_listener: TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("websocket server has shut down");
|
||||
|
||||
47
proxy/src/logging.rs
Normal file
47
proxy/src/logging.rs
Normal file
@@ -0,0 +1,47 @@
|
||||
use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
use tracing_subscriber::{
|
||||
filter::{EnvFilter, LevelFilter},
|
||||
prelude::*,
|
||||
};
|
||||
|
||||
/// Initialize logging and OpenTelemetry tracing and exporter.
|
||||
///
|
||||
/// Logging can be configured using `RUST_LOG` environment variable.
|
||||
///
|
||||
/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
|
||||
/// configuration from environment variables. For example, to change the
|
||||
/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
|
||||
/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
|
||||
pub async fn init() -> anyhow::Result<LoggingGuard> {
|
||||
let env_filter = EnvFilter::builder()
|
||||
.with_default_directive(LevelFilter::INFO.into())
|
||||
.from_env_lossy();
|
||||
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
.with_ansi(atty::is(atty::Stream::Stderr))
|
||||
.with_writer(std::io::stderr)
|
||||
.with_target(false);
|
||||
|
||||
let otlp_layer = tracing_utils::init_tracing("proxy")
|
||||
.await
|
||||
.map(OpenTelemetryLayer::new);
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(env_filter)
|
||||
.with(otlp_layer)
|
||||
.with(fmt_layer)
|
||||
.try_init()?;
|
||||
|
||||
Ok(LoggingGuard)
|
||||
}
|
||||
|
||||
pub struct LoggingGuard;
|
||||
|
||||
impl Drop for LoggingGuard {
|
||||
fn drop(&mut self) {
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit.
|
||||
tracing::info!("shutting down the tracing machinery");
|
||||
tracing_utils::shutdown_tracing();
|
||||
}
|
||||
}
|
||||
@@ -12,6 +12,7 @@ mod config;
|
||||
mod console;
|
||||
mod error;
|
||||
mod http;
|
||||
mod logging;
|
||||
mod metrics;
|
||||
mod parse;
|
||||
mod proxy;
|
||||
@@ -27,7 +28,7 @@ use config::ProxyConfig;
|
||||
use futures::FutureExt;
|
||||
use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
use tracing::{info, warn};
|
||||
use utils::{project_git_version, sentry_init::init_sentry};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
@@ -41,8 +42,8 @@ async fn flatten_err(
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
// First, initialize logging and troubleshooting subsystems.
|
||||
init_tracing();
|
||||
let _logging_guard = logging::init().await?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
info!("Version: {GIT_VERSION}");
|
||||
@@ -60,16 +61,17 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let mgmt_address: SocketAddr = args.get_one::<String>("mgmt").unwrap().parse()?;
|
||||
info!("Starting mgmt on {mgmt_address}");
|
||||
let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?;
|
||||
let mgmt_listener = TcpListener::bind(mgmt_address).await?;
|
||||
|
||||
let proxy_address: SocketAddr = args.get_one::<String>("proxy").unwrap().parse()?;
|
||||
info!("Starting proxy on {proxy_address}");
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
|
||||
let mut tasks = vec![
|
||||
tokio::spawn(handle_signals()),
|
||||
tokio::spawn(http::server::task_main(http_listener)),
|
||||
tokio::spawn(proxy::task_main(config, proxy_listener)),
|
||||
tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)),
|
||||
tokio::spawn(console::mgmt::task_main(mgmt_listener)),
|
||||
];
|
||||
|
||||
if let Some(wss_address) = args.get_one::<String>("wss") {
|
||||
@@ -78,48 +80,50 @@ async fn main() -> anyhow::Result<()> {
|
||||
let wss_listener = TcpListener::bind(wss_address).await?;
|
||||
|
||||
tasks.push(tokio::spawn(http::websocket::task_main(
|
||||
wss_listener,
|
||||
config,
|
||||
wss_listener,
|
||||
)));
|
||||
}
|
||||
|
||||
// TODO: refactor.
|
||||
if let Some(metric_collection) = &config.metric_collection {
|
||||
let hostname = hostname::get()?
|
||||
.into_string()
|
||||
.map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
|
||||
|
||||
tasks.push(tokio::spawn(
|
||||
metrics::collect_metrics(
|
||||
&metric_collection.endpoint,
|
||||
metric_collection.interval,
|
||||
hostname,
|
||||
)
|
||||
.instrument(info_span!("collect_metrics")),
|
||||
));
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
|
||||
}
|
||||
|
||||
// This will block until all tasks have completed.
|
||||
// Furthermore, the first one to fail will cancel the rest.
|
||||
// This combinator will block until either all tasks complete or
|
||||
// one of them finishes with an error (others will be cancelled).
|
||||
let tasks = tasks.into_iter().map(flatten_err);
|
||||
let _: Vec<()> = futures::future::try_join_all(tasks).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tracing is used for logging and telemetry.
|
||||
fn init_tracing() {
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter({
|
||||
// This filter will examine the `RUST_LOG` env variable.
|
||||
use tracing_subscriber::filter::{EnvFilter, LevelFilter};
|
||||
EnvFilter::builder()
|
||||
.with_default_directive(LevelFilter::INFO.into())
|
||||
.from_env_lossy()
|
||||
})
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_target(false)
|
||||
.init();
|
||||
/// Handle unix signals appropriately.
|
||||
async fn handle_signals() -> anyhow::Result<()> {
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
|
||||
let mut hangup = signal(SignalKind::hangup())?;
|
||||
let mut interrupt = signal(SignalKind::interrupt())?;
|
||||
let mut terminate = signal(SignalKind::terminate())?;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Hangup is commonly used for config reload.
|
||||
_ = hangup.recv() => {
|
||||
warn!("received SIGHUP; config reload is not supported");
|
||||
}
|
||||
// Shut down the whole application.
|
||||
_ = interrupt.recv() => {
|
||||
warn!("received SIGINT, exiting immediately");
|
||||
bail!("interrupted");
|
||||
}
|
||||
// TODO: Don't accept new proxy connections.
|
||||
// TODO: Shut down once all exisiting connections have been closed.
|
||||
_ = terminate.recv() => {
|
||||
warn!("received SIGTERM, exiting immediately");
|
||||
bail!("terminated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
@@ -161,7 +165,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
|
||||
}));
|
||||
|
||||
let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
|
||||
let endpoint = http::Endpoint::new(url, reqwest::Client::new());
|
||||
let endpoint = http::Endpoint::new(url, http::new_client());
|
||||
|
||||
let api = console::provider::neon::Api::new(endpoint, caches);
|
||||
auth::BackendType::Console(Cow::Owned(api), ())
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
//!
|
||||
//! Periodically collect proxy consumption metrics
|
||||
//! and push them to a HTTP endpoint.
|
||||
//!
|
||||
use crate::{config::MetricCollectionConfig, http};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use serde::Serialize;
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use tracing::{debug, error, log::info, trace};
|
||||
use std::collections::HashMap;
|
||||
use tracing::{debug, error, info, instrument, trace};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
@@ -19,48 +18,41 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
/// so while the project-id is unique across regions the whole pipeline will work correctly
|
||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||
///
|
||||
#[derive(Eq, Hash, PartialEq, Serialize)]
|
||||
#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: String,
|
||||
}
|
||||
|
||||
pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
metric_collection_interval: Duration,
|
||||
hostname: String,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
|
||||
info!("metrics collector config: {config:?}");
|
||||
scopeguard::defer! {
|
||||
info!("collect_metrics has shut down");
|
||||
info!("metrics collector has shut down");
|
||||
}
|
||||
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
info!(
|
||||
"starting collect_metrics. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let http_client = http::new_client();
|
||||
let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
|
||||
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
|
||||
|
||||
let mut ticker = tokio::time::interval(config.interval);
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = ticker.tick() => {
|
||||
ticker.tick().await;
|
||||
|
||||
match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("Failed to send consumption metrics: {} ", e);
|
||||
},
|
||||
Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
|
||||
}
|
||||
}
|
||||
let res = collect_metrics_iteration(
|
||||
&http_client,
|
||||
&mut cached_metrics,
|
||||
&config.endpoint,
|
||||
&hostname,
|
||||
)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Err(e) => error!("failed to send consumption metrics: {e} "),
|
||||
Ok(_) => trace!("periodic metrics collection completed successfully"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
|
||||
let metrics = prometheus::default_registry().gather();
|
||||
|
||||
@@ -99,11 +91,12 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
current_metrics
|
||||
}
|
||||
|
||||
pub async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
#[instrument(skip_all)]
|
||||
async fn collect_metrics_iteration(
|
||||
client: &http::ClientWithMiddleware,
|
||||
cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
hostname: String,
|
||||
hostname: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
@@ -134,7 +127,7 @@ pub async fn collect_metrics_iteration(
|
||||
stop_time: *curr_time,
|
||||
},
|
||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||
idempotency_key: idempotency_key(hostname.clone()),
|
||||
idempotency_key: idempotency_key(hostname.to_owned()),
|
||||
value,
|
||||
extra: Ids {
|
||||
endpoint_id: curr_key.endpoint_id.clone(),
|
||||
@@ -190,6 +183,12 @@ pub async fn collect_metrics_iteration(
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk.iter() {
|
||||
// Report if the metric value is suspiciously large
|
||||
if metric.value > (1u64 << 40) {
|
||||
error!("potentially abnormal metric value: {:?}", metric);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -17,7 +17,7 @@ use once_cell::sync::Lazy;
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
/// Number of times we should retry the `/proxy_wake_compute` http request.
|
||||
const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
|
||||
@@ -91,13 +91,13 @@ pub async fn task_main(
|
||||
.unwrap_or_else(|e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
})
|
||||
.instrument(info_span!("client", session = format_args!("{session_id}"))),
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(tech debt): unite this with its twin below.
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
pub async fn handle_ws_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -127,7 +127,7 @@ pub async fn handle_ws_client(
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name, true))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
@@ -135,10 +135,11 @@ pub async fn handle_ws_client(
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session))
|
||||
.with_session(|session| client.connect_to_db(session, true))
|
||||
.await
|
||||
}
|
||||
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
async fn handle_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -165,7 +166,7 @@ async fn handle_client(
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name, false))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
@@ -173,7 +174,7 @@ async fn handle_client(
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session))
|
||||
.with_session(|session| client.connect_to_db(session, false))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -401,7 +402,11 @@ impl<'a, S> Client<'a, S> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
/// Let the client authenticate and connect to the designated compute node.
|
||||
async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> {
|
||||
async fn connect_to_db(
|
||||
self,
|
||||
session: cancellation::Session<'_>,
|
||||
allow_cleartext: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let Self {
|
||||
mut stream,
|
||||
mut creds,
|
||||
@@ -416,10 +421,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
|
||||
let auth_result = async {
|
||||
// `&mut stream` doesn't let us merge those 2 lines.
|
||||
let res = creds.authenticate(&extra, &mut stream).await;
|
||||
let res = creds
|
||||
.authenticate(&extra, &mut stream, allow_cleartext)
|
||||
.await;
|
||||
|
||||
async { res }.or_else(|e| stream.throw_error(e)).await
|
||||
}
|
||||
.instrument(info_span!("auth"))
|
||||
.await?;
|
||||
|
||||
let AuthSuccess {
|
||||
|
||||
@@ -19,12 +19,12 @@ types-requests = "^2.28.5"
|
||||
types-psycopg2 = "^2.9.18"
|
||||
boto3 = "^1.26.16"
|
||||
boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
|
||||
moto = {version = "^3.0.0", extras = ["server"]}
|
||||
moto = {extras = ["server"], version = "^4.1.2"}
|
||||
backoff = "^1.11.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "2.1.2"
|
||||
Werkzeug = "^2.2.3"
|
||||
pytest-order = "^1.0.1"
|
||||
allure-pytest = "^2.10.0"
|
||||
pytest-asyncio = "^0.19.0"
|
||||
|
||||
@@ -126,7 +126,12 @@ fn main() -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// important to keep the order of:
|
||||
// 1. init logging
|
||||
// 2. tracing panic hook
|
||||
// 3. sentry
|
||||
logging::init(LogFormat::from_config(&args.log_format)?)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
info!("version: {GIT_VERSION}");
|
||||
|
||||
let args_workdir = &args.datadir;
|
||||
|
||||
@@ -424,12 +424,16 @@ async fn http1_handler(
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
// important to keep the order of:
|
||||
// 1. init logging
|
||||
// 2. tracing panic hook
|
||||
// 3. sentry
|
||||
logging::init(LogFormat::from_config(&args.log_format)?)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
info!("version: {GIT_VERSION}");
|
||||
::metrics::set_build_info_metric(GIT_VERSION);
|
||||
|
||||
|
||||
@@ -1217,7 +1217,7 @@ class PageserverHttpClient(requests.Session):
|
||||
"""
|
||||
Returns the tenant size, together with the model inputs as the second tuple item.
|
||||
"""
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size")
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
|
||||
self.verbose_error(res)
|
||||
res = res.json()
|
||||
assert isinstance(res, dict)
|
||||
@@ -1228,6 +1228,16 @@ class PageserverHttpClient(requests.Session):
|
||||
assert type(inputs) is dict
|
||||
return (size, inputs)
|
||||
|
||||
def tenant_size_debug(self, tenant_id: TenantId) -> str:
|
||||
"""
|
||||
Returns the tenant size debug info, as an HTML string
|
||||
"""
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
|
||||
headers={"Accept": "text/html"},
|
||||
)
|
||||
return res.text
|
||||
|
||||
def timeline_list(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
@@ -1278,6 +1288,7 @@ class PageserverHttpClient(requests.Session):
|
||||
timeline_id: TimelineId,
|
||||
include_non_incremental_logical_size: bool = False,
|
||||
include_timeline_dir_layer_file_size_sum: bool = False,
|
||||
**kwargs,
|
||||
) -> Dict[Any, Any]:
|
||||
params = {}
|
||||
if include_non_incremental_logical_size:
|
||||
@@ -1288,6 +1299,7 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
params=params,
|
||||
**kwargs,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
@@ -2067,6 +2079,7 @@ class NeonPageserver(PgProtocol):
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
|
||||
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
|
||||
".*task iteration took longer than the configured period.*",
|
||||
]
|
||||
|
||||
def start(
|
||||
@@ -2517,15 +2530,17 @@ class NeonProxy(PgProtocol):
|
||||
tb: Optional[TracebackType],
|
||||
):
|
||||
if self._popen is not None:
|
||||
# NOTE the process will die when we're done with tests anyway, because
|
||||
# it's a child process. This is mostly to clean up in between different tests.
|
||||
self._popen.kill()
|
||||
self._popen.terminate()
|
||||
try:
|
||||
self._popen.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
log.warn("failed to gracefully terminate proxy; killing")
|
||||
self._popen.kill()
|
||||
|
||||
@staticmethod
|
||||
async def activate_link_auth(
|
||||
local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True
|
||||
):
|
||||
|
||||
pg_user = "proxy"
|
||||
|
||||
if create_user:
|
||||
|
||||
@@ -1,19 +1,25 @@
|
||||
# It's possible to run any regular test with the local fs remote storage via
|
||||
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
|
||||
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, DefaultDict, Dict
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PageserverApiException,
|
||||
RemoteStorageKind,
|
||||
assert_tenant_status,
|
||||
available_remote_storages,
|
||||
wait_for_last_flush_lsn,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_sk_commit_lsn_to_reach_remote_storage,
|
||||
wait_for_upload,
|
||||
wait_until,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.types import Lsn
|
||||
from fixtures.utils import query_scalar
|
||||
@@ -212,6 +218,9 @@ def test_ondemand_download_timetravel(
|
||||
log.info(filled_size)
|
||||
assert filled_current_physical == filled_size, "we don't yet do layer eviction"
|
||||
|
||||
# Wait until generated image layers are uploaded to S3
|
||||
time.sleep(3)
|
||||
|
||||
env.pageserver.stop()
|
||||
|
||||
# remove all the layer files
|
||||
@@ -449,3 +458,233 @@ def test_download_remote_layers_api(
|
||||
pg_old = env.postgres.create_start(branch_name="main")
|
||||
with pg_old.cursor() as cur:
|
||||
assert query_scalar(cur, "select count(*) from testtab") == table_len
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
|
||||
def test_compaction_downloads_on_demand_without_image_creation(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""
|
||||
Create a few layers, then evict, then make sure compaction runs successfully.
|
||||
"""
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_compaction_downloads_on_demand_without_image_creation",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
conf = {
|
||||
# Disable background GC & compaction
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
# unused, because manual will be called after each table
|
||||
"checkpoint_distance": 100 * 1024**2,
|
||||
# this will be updated later on to allow manual compaction outside of checkpoints
|
||||
"compaction_threshold": 100,
|
||||
# repartitioning parameter, not required here
|
||||
"image_creation_threshold": 100,
|
||||
# repartitioning parameter, not required here
|
||||
"compaction_target_size": 128 * 1024**2,
|
||||
# pitr_interval and gc_horizon are not interesting because we dont run gc
|
||||
}
|
||||
|
||||
# Override defaults, to create more layers
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
|
||||
env.initial_tenant = tenant_id
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
with env.postgres.create_start("main") as pg:
|
||||
# no particular reason to create the layers like this, but we are sure
|
||||
# not to hit the image_creation_threshold here.
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)")
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("update a set id = -id")
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
|
||||
assert len(layers.historic_layers) == 1 + 2, "should have inidb layer and 2 deltas"
|
||||
|
||||
for layer in layers.historic_layers:
|
||||
log.info(f"pre-compact: {layer}")
|
||||
pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"})
|
||||
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
for layer in layers.historic_layers:
|
||||
log.info(f"post compact: {layer}")
|
||||
assert len(layers.historic_layers) == 1, "should have compacted to single layer"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
|
||||
def test_compaction_downloads_on_demand_with_image_creation(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""
|
||||
Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted.
|
||||
|
||||
Due to current implementation, this will make image creation on-demand download layers, but we cannot really
|
||||
directly test for it.
|
||||
"""
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_compaction_downloads_on_demand",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
conf = {
|
||||
# Disable background GC & compaction
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
# repartitioning threshold is this / 10, but it doesn't really seem to matter
|
||||
"checkpoint_distance": 50 * 1024**2,
|
||||
"compaction_threshold": 3,
|
||||
# important: keep this high for the data ingestion
|
||||
"image_creation_threshold": 100,
|
||||
# repartitioning parameter, unused
|
||||
"compaction_target_size": 128 * 1024**2,
|
||||
# pitr_interval and gc_horizon are not interesting because we dont run gc
|
||||
}
|
||||
|
||||
# Override defaults, to create more layers
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
|
||||
env.initial_tenant = tenant_id
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
# no particular reason to create the layers like this, but we are sure
|
||||
# not to hit the image_creation_threshold here.
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
|
||||
cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
for i in range(0, 2):
|
||||
for j in range(0, 3):
|
||||
# create a minimal amount of "delta difficulty" for this table
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("update a set some_value = -some_value + %s", (j,))
|
||||
|
||||
with pg.cursor() as cur:
|
||||
# vacuuming should aid to reuse keys, though it's not really important
|
||||
# with image_creation_threshold=1 which we will use on the last compaction
|
||||
cur.execute("vacuum")
|
||||
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
if i == 1 and j == 2:
|
||||
# last iteration; stop before checkpoint to avoid leaving an inmemory layer
|
||||
pg.stop_and_destroy()
|
||||
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# images should not yet be created, because threshold is too high,
|
||||
# but these will be reshuffled to L1 layers
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
|
||||
for _ in range(0, 20):
|
||||
# loop in case flushing is still in progress
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
if not layers.in_memory_layers:
|
||||
break
|
||||
time.sleep(0.2)
|
||||
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
|
||||
|
||||
kinds_before: DefaultDict[str, int] = defaultdict(int)
|
||||
|
||||
for layer in layers.historic_layers:
|
||||
kinds_before[layer.kind] += 1
|
||||
pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
assert dict(kinds_before) == {"Delta": 4}
|
||||
|
||||
# now having evicted all layers, reconfigure to have lower image creation
|
||||
# threshold to expose image creation to downloading all of the needed
|
||||
# layers -- threshold of 2 would sound more reasonable, but keeping it as 1
|
||||
# to be less flaky
|
||||
env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"})
|
||||
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
kinds_after: DefaultDict[str, int] = defaultdict(int)
|
||||
for layer in layers.historic_layers:
|
||||
kinds_after[layer.kind] += 1
|
||||
|
||||
assert dict(kinds_after) == {"Delta": 4, "Image": 1}
|
||||
|
||||
|
||||
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
|
||||
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_ondemand_download_failure_to_replace(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""
|
||||
Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
|
||||
|
||||
See: https://github.com/neondatabase/neon/issues/3533
|
||||
"""
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_ondemand_download_failure_to_replace",
|
||||
)
|
||||
|
||||
# disable gc and compaction via default tenant config because config is lost while detaching
|
||||
# so that compaction will not be the one to download the layer but the http handler is
|
||||
neon_env_builder.pageserver_config_override = (
|
||||
"""tenant_config={gc_period = "0s", compaction_period = "0s"}"""
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
|
||||
env.initial_tenant = tenant_id
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
|
||||
|
||||
wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
|
||||
|
||||
# remove layers so that they will be redownloaded
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
|
||||
pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
|
||||
|
||||
# requesting details with non-incremental size should trigger a download of the only layer
|
||||
# this will need to be adjusted if an index for logical sizes is ever implemented
|
||||
with pytest.raises(PageserverApiException):
|
||||
# error message is not useful
|
||||
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
|
||||
|
||||
actual_message = (
|
||||
".* ERROR .*replacing downloaded layer into layermap failed because layer was not found"
|
||||
)
|
||||
assert env.pageserver.log_contains(actual_message) is not None
|
||||
env.pageserver.allowed_errors.append(actual_message)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
|
||||
)
|
||||
# this might get to run and attempt on-demand, but not always
|
||||
env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
|
||||
|
||||
# if the above returned, then we didn't have a livelock, and all is well
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
from typing import Any, List, Tuple
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import parse_metrics
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
|
||||
from fixtures.types import Lsn
|
||||
|
||||
|
||||
def test_empty_tenant_size(neon_simple_env: NeonEnv):
|
||||
def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
env = neon_simple_env
|
||||
(tenant_id, _) = env.neon_cli.create_tenant()
|
||||
http_client = env.pageserver.http_client()
|
||||
@@ -18,6 +18,9 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
|
||||
|
||||
main_branch_name = "main"
|
||||
|
||||
branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
|
||||
assert branch_name == main_branch_name
|
||||
|
||||
with env.postgres.create_start(
|
||||
main_branch_name,
|
||||
tenant_id=tenant_id,
|
||||
@@ -39,12 +42,44 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
|
||||
size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
|
||||
assert size == initial_size, "tenant_size should not be affected by shutdown of compute"
|
||||
|
||||
expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"]
|
||||
actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # type: ignore
|
||||
assert actual_commands == expected_commands
|
||||
expected_inputs = {
|
||||
"segments": [
|
||||
{
|
||||
"segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True},
|
||||
"timeline_id": f"{main_timeline_id}",
|
||||
"kind": "BranchStart",
|
||||
},
|
||||
{
|
||||
"segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True},
|
||||
"timeline_id": f"{main_timeline_id}",
|
||||
"kind": "BranchEnd",
|
||||
},
|
||||
],
|
||||
"timeline_inputs": [
|
||||
{
|
||||
"timeline_id": f"{main_timeline_id}",
|
||||
"ancestor_id": None,
|
||||
"ancestor_lsn": "0/0",
|
||||
"last_record": "0/1698CC0",
|
||||
"latest_gc_cutoff": "0/1698C48",
|
||||
"horizon_cutoff": "0/0",
|
||||
"pitr_cutoff": "0/0",
|
||||
"next_gc_cutoff": "0/0",
|
||||
"retention_param_cutoff": None,
|
||||
}
|
||||
],
|
||||
}
|
||||
expected_inputs = mask_model_inputs(expected_inputs)
|
||||
actual_inputs = mask_model_inputs(inputs)
|
||||
|
||||
assert expected_inputs == actual_inputs
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
|
||||
def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
|
||||
def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
"""
|
||||
Issue found in production. Because the ancestor branch was under
|
||||
gc_horizon, the branchpoint was "dangling" and the computation could not be
|
||||
@@ -75,8 +110,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
|
||||
|
||||
assert size_after_branching > initial_size
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
|
||||
|
||||
def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
"""
|
||||
More general version of test_branched_empty_timeline_size
|
||||
|
||||
@@ -128,9 +167,13 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
|
||||
size_after_writes = http_client.tenant_size(tenant_id)
|
||||
assert size_after_writes > initial_size
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
|
||||
@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
|
||||
def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
|
||||
def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
"""
|
||||
gc_horizon = 15
|
||||
|
||||
@@ -167,9 +210,13 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
|
||||
|
||||
assert size_before_branching < size_after
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
|
||||
@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
|
||||
def test_parent_within_horizon(neon_simple_env: NeonEnv):
|
||||
def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
"""
|
||||
gc_horizon = 5
|
||||
|
||||
@@ -179,7 +226,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
|
||||
env = neon_simple_env
|
||||
gc_horizon = 200_000
|
||||
gc_horizon = 5_000
|
||||
(tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
|
||||
http_client = env.pageserver.http_client()
|
||||
|
||||
@@ -212,9 +259,13 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):
|
||||
|
||||
assert size_before_branching < size_after
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
|
||||
@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
|
||||
def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
|
||||
def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
"""
|
||||
gc_horizon = small
|
||||
|
||||
@@ -253,8 +304,14 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
|
||||
|
||||
latest_size = size_now
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
def test_single_branch_get_tenant_size_grows(
|
||||
neon_env_builder: NeonEnvBuilder, test_output_dir: Path
|
||||
):
|
||||
"""
|
||||
Operate on single branch reading the tenants size after each transaction.
|
||||
"""
|
||||
@@ -279,7 +336,20 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
collected_responses: List[Tuple[Lsn, int]] = []
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
|
||||
def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int):
|
||||
if current_lsn - initdb_lsn > gc_horizon:
|
||||
assert (
|
||||
size >= prev
|
||||
), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size"
|
||||
else:
|
||||
assert (
|
||||
size > prev
|
||||
), "tenant_size should grow, because we continue to add WAL to initial snapshot size"
|
||||
|
||||
with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
|
||||
initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)")
|
||||
|
||||
@@ -297,13 +367,19 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
size = http_client.tenant_size(tenant_id)
|
||||
size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
|
||||
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
if len(collected_responses) > 0:
|
||||
prev = collected_responses[-1][1]
|
||||
if size == 0:
|
||||
assert prev == 0
|
||||
else:
|
||||
# branch start shouldn't be past gc_horizon yet
|
||||
# thus the size should grow as we insert more data
|
||||
assert current_lsn - initdb_lsn <= gc_horizon
|
||||
assert size > prev
|
||||
|
||||
collected_responses.append((current_lsn, size))
|
||||
@@ -323,9 +399,15 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
size = http_client.tenant_size(tenant_id)
|
||||
size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
|
||||
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
prev = collected_responses[-1][1]
|
||||
assert size > prev, "tenant_size should grow with updates"
|
||||
|
||||
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
|
||||
|
||||
collected_responses.append((current_lsn, size))
|
||||
|
||||
while True:
|
||||
@@ -340,9 +422,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
size = http_client.tenant_size(tenant_id)
|
||||
prev = collected_responses[-1][1]
|
||||
assert (
|
||||
size > prev
|
||||
), "even though rows have been deleted, the tenant_size should increase"
|
||||
|
||||
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
|
||||
|
||||
collected_responses.append((current_lsn, size))
|
||||
|
||||
with pg.cursor() as cur:
|
||||
@@ -352,7 +434,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
size = http_client.tenant_size(tenant_id)
|
||||
prev = collected_responses[-1][1]
|
||||
assert size > prev, "dropping table grows tenant_size"
|
||||
|
||||
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
|
||||
|
||||
collected_responses.append((current_lsn, size))
|
||||
|
||||
# this isn't too many lines to forget for a while. observed while
|
||||
@@ -364,24 +448,17 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
size_debug_file.close()
|
||||
|
||||
size_after = http_client.tenant_size(tenant_id)
|
||||
prev = collected_responses[-1][1]
|
||||
|
||||
assert size_after == prev, "size after restarting pageserver should not have changed"
|
||||
|
||||
ps_metrics = parse_metrics(http_client.get_metrics(), "pageserver")
|
||||
tenant_metric_filter = {
|
||||
"tenant_id": str(tenant_id),
|
||||
}
|
||||
|
||||
tenant_size_metric = int(
|
||||
ps_metrics.query_one("pageserver_tenant_synthetic_size", filter=tenant_metric_filter).value
|
||||
)
|
||||
|
||||
assert tenant_size_metric == size_after, "API size value should be equal to metric size value"
|
||||
|
||||
|
||||
def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder):
|
||||
def test_get_tenant_size_with_multiple_branches(
|
||||
neon_env_builder: NeonEnvBuilder, test_output_dir: Path
|
||||
):
|
||||
"""
|
||||
Reported size goes up while branches or rows are being added, goes down after removing branches.
|
||||
"""
|
||||
@@ -481,6 +558,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
|
||||
size_after = http_client.tenant_size(tenant_id)
|
||||
assert size_after == size_after_thinning_branch
|
||||
|
||||
size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file_before.write(size_debug)
|
||||
|
||||
# teardown, delete branches, and the size should be going down
|
||||
http_client.timeline_delete(tenant_id, first_branch_timeline_id)
|
||||
|
||||
@@ -493,3 +574,38 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
|
||||
|
||||
assert size_after_deleting_second < size_after_continuing_on_main
|
||||
assert size_after_deleting_second > size_after_first_branch
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
|
||||
# Helper for tests that compare timeline_inputs
|
||||
# We don't want to compare the exact values, because they can be unstable
|
||||
# and cause flaky tests. So replace the values with useful invariants.
|
||||
def mask_model_inputs(x):
|
||||
if isinstance(x, dict):
|
||||
newx = {}
|
||||
for k, v in x.items():
|
||||
if k == "size":
|
||||
if v is None or v == 0:
|
||||
# no change
|
||||
newx[k] = v
|
||||
elif v < 0:
|
||||
newx[k] = "<0"
|
||||
else:
|
||||
newx[k] = ">0"
|
||||
elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record":
|
||||
if v is None or v == 0 or v == "0/0":
|
||||
# no change
|
||||
newx[k] = v
|
||||
else:
|
||||
newx[k] = "masked"
|
||||
else:
|
||||
newx[k] = mask_model_inputs(v)
|
||||
return newx
|
||||
elif isinstance(x, list):
|
||||
newlist = [mask_model_inputs(v) for v in x]
|
||||
return newlist
|
||||
else:
|
||||
return x
|
||||
|
||||
@@ -280,6 +280,7 @@ def test_tenant_upgrades_index_json_from_v0(
|
||||
|
||||
timeline_file.seek(0)
|
||||
json.dump(v0_index_part, timeline_file)
|
||||
timeline_file.truncate(timeline_file.tell())
|
||||
|
||||
env.pageserver.start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
12
vm-cgconfig.conf
Normal file
12
vm-cgconfig.conf
Normal file
@@ -0,0 +1,12 @@
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
group neon-postgres {
|
||||
perm {
|
||||
admin {
|
||||
uid = vm-informant;
|
||||
}
|
||||
task {
|
||||
gid = users;
|
||||
}
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
@@ -38,14 +38,14 @@ prost = { version = "0.11" }
|
||||
rand = { version = "0.8", features = ["small_rng"] }
|
||||
regex = { version = "1" }
|
||||
regex-syntax = { version = "0.6" }
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
|
||||
ring = { version = "0.16", features = ["std"] }
|
||||
rustls = { version = "0.20", features = ["dangerous_configuration"] }
|
||||
scopeguard = { version = "1" }
|
||||
serde = { version = "1", features = ["alloc", "derive"] }
|
||||
serde_json = { version = "1", features = ["raw_value"] }
|
||||
socket2 = { version = "0.4", default-features = false, features = ["all"] }
|
||||
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] }
|
||||
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] }
|
||||
tokio-util = { version = "0.7", features = ["codec", "io"] }
|
||||
tonic = { version = "0.8", features = ["tls-roots"] }
|
||||
tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
|
||||
|
||||
Reference in New Issue
Block a user