Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-25 14:20:38 +00:00)

Compare commits: pg_log_vie... and sk/auth_re... (52 commits)
| SHA1 |
|---|
| 892723cc7e |
| 8543485e92 |
| ec53c5ca2e |
| 94d612195a |
| b1329db495 |
| 5bb971d64e |
| 0364f77b9a |
| 4ac6a9f089 |
| 9486d76b2a |
| 040f736909 |
| 645e4f6ab9 |
| e947cc119b |
| 53e5d18da5 |
| 3813c703c9 |
| b15204fa8c |
| 81c75586ab |
| 556fb1642a |
| 23aca81943 |
| 42798e6adc |
| b03143dfc8 |
| fdacfaabfd |
| b2a3981ead |
| fe0b616299 |
| c4e1cafb63 |
| fdf5e4db5e |
| d1e86d65dc |
| f5b4697c90 |
| 3be81dd36b |
| e6ec2400fc |
| 5b911e1f9f |
| 9ea7b5dd38 |
| 0112a602e1 |
| 92214578af |
| 6861259be7 |
| 11df2ee5d7 |
| 31a3910fd9 |
| 381c8fca4f |
| 4625da3164 |
| 850f6b1cb9 |
| f19b70b379 |
| 9d0cf08d5f |
| 2d6fd72177 |
| 8945fbdb31 |
| 05ac0e2493 |
| bfd45dd671 |
| 7f80230fd2 |
| 78bbbccadb |
| dbbe032c39 |
| cb9473928d |
| fa20e37574 |
| 4911d7ce6f |
| e83684b868 |
47 .github/ansible/staging.eu-central-1.hosts.yaml (vendored, new file)

@@ -0,0 +1,47 @@
storage:
  vars:
    bucket_name: neon-dev-storage-eu-central-1
    bucket_region: eu-central-1
    # We only register/update storage in one preview console and manually copy to other instances
    console_mgmt_base_url: http://neon-internal-api.helium.aws.neon.build
    broker_endpoint: http://storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      metric_collection_endpoint: http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events
      metric_collection_interval: 10min
      disk_usage_based_eviction:
        max_usage_pct: 80
        min_avail_bytes: 0
        period: "10s"
      tenant_config:
        eviction_policy:
          kind: "LayerAccessThreshold"
          period: "20m"
          threshold: &default_eviction_threshold "20m"
        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "pageserver/v1"
    safekeeper_s3_prefix: safekeeper/v1/wal
    hostname_suffix: ""
    remote_user: ssm-user
    ansible_aws_ssm_region: eu-central-1
    ansible_aws_ssm_bucket_name: neon-dev-storage-eu-central-1
    console_region_id: aws-eu-central-1
    sentry_environment: staging

  children:
    pageservers:
      hosts:
        pageserver-0.eu-central-1.aws.neon.build:
          ansible_host: i-011f93ec26cfba2d4

    safekeepers:
      hosts:
        safekeeper-0.eu-central-1.aws.neon.build:
          ansible_host: i-0ff026d27babf8ddd
        safekeeper-1.eu-central-1.aws.neon.build:
          ansible_host: i-03983a49ee54725d9
        safekeeper-2.eu-central-1.aws.neon.build:
          ansible_host: i-0bd025ecdb61b0db3
4 .github/ansible/staging.us-east-2.hosts.yaml (vendored)

@@ -48,9 +48,9 @@ storage:
      hosts:
        safekeeper-0.us-east-2.aws.neon.build:
          ansible_host: i-027662bd552bf5db0
        safekeeper-1.us-east-2.aws.neon.build:
          ansible_host: i-0171efc3604a7b907
        safekeeper-2.us-east-2.aws.neon.build:
          ansible_host: i-0de0b03a51676a6ce
        safekeeper-3.us-east-2.aws.neon.build:
          ansible_host: i-05f8ba2cda243bd18
        safekeeper-99.us-east-2.aws.neon.build:
          ansible_host: i-0d61b6a2ea32028d5
52 .github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml (vendored, new file)

@@ -0,0 +1,52 @@
# Helm chart values for neon-storage-broker
podLabels:
  neon_env: staging
  neon_service: storage-broker

# Use L4 LB
service:
  # service.annotations -- Annotations to add to the service
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
    # assign service to this name at external-dns
    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build
  # service.type -- Service type
  type: LoadBalancer
  # service.port -- broker listen port
  port: 50051

ingress:
  enabled: false

metrics:
  enabled: false

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-storage-broker.fullname\" . }}"
      labels:
        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-storage-broker
        app.kubernetes.io/instance: neon-storage-broker
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-storage-broker"
      endpoints:
        - port: broker
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"

settings:
  sentryEnvironment: "staging"
@@ -23,6 +23,7 @@ settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.eu-west-1.aws.neon.build"
  otelExporterOtlpEndpoint: "https://otel-collector.zeta.eu-west-1.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"

@@ -9,6 +9,7 @@ settings:
  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
  uri: "https://console.stage.neon.tech/psql_session/"
  domain: "pg.neon.build"
  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

@@ -24,6 +24,7 @@ settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.cloud.stage.neon.tech"
  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"

@@ -25,6 +25,7 @@ settings:
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.us-east-2.aws.neon.build"
  extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
67 .github/helm-values/preview-template.neon-proxy-scram.yaml (vendored, new file)

@@ -0,0 +1,67 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 100%
    maxUnavailable: 50%

image:
  repository: neondatabase/neon

settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/management/api/v2"
  domain: "*.cloud.${PREVIEW_NAME}.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

# -- Additional labels for neon-proxy pods
podLabels:
  neon_service: proxy-scram
  neon_env: test
  neon_region: ${PREVIEW_NAME}.eu-central-1


exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
    external-dns.alpha.kubernetes.io/hostname: cloud.${PREVIEW_NAME}.aws.neon.build
  httpsPort: 443

#metrics:
#  enabled: true
#  serviceMonitor:
#    enabled: true
#    selector:
#      release: kube-prometheus-stack

extraManifests:
  - apiVersion: operator.victoriametrics.com/v1beta1
    kind: VMServiceScrape
    metadata:
      name: "{{ include \"neon-proxy.fullname\" . }}"
      labels:
        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
        app.kubernetes.io/name: neon-proxy
        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
        app.kubernetes.io/managed-by: Helm
      namespace: "{{ .Release.Namespace }}"
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: "neon-proxy"
      endpoints:
        - port: http
          path: /metrics
          interval: 10s
          scrapeTimeout: 10s
      namespaceSelector:
        matchNames:
          - "{{ .Release.Namespace }}"
@@ -23,8 +23,8 @@ settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.us-east-1.aws.neon.tech"
  # These domains haven't been delegated yet.
  # extraDomains: ["*.us-east-1.retooldb.com", "*.us-east-1.postgres.vercel-storage.com"]
  # *.us-east-1.retooldb.com hasn't been delegated yet.
  extraDomains: ["*.us-east-1.postgres.vercel-storage.com"]
  sentryEnvironment: "production"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
199 .github/workflows/build_and_test.yml (vendored)

@@ -111,8 +111,21 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

      - name: Run cargo clippy
        run: ./run_clippy.sh
      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
      # This will catch compiler & clippy warnings in all feature combinations.
      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
      # NB: keep clippy args in sync with ./run_clippy.sh
      - run: |
          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
            echo "No clippy args found in .neon_clippy_args"
            exit 1
          fi
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
      - name: Run cargo clippy (release)
        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting

@@ -541,7 +554,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
    needs: [ push-docker-hub, tag ]
    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |

@@ -584,8 +597,7 @@ jobs:
  neon-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
    # https://github.com/GoogleContainerTools/kaniko/issues/2005
    container: gcr.io/kaniko-project/executor:v1.7.0-debug
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
        shell: sh -eu {0}

@@ -597,11 +609,32 @@ jobs:
          submodules: true
          fetch-depth: 0

      - name: Configure ECR login
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
      - name: Configure ECR and Docker Hub login
        run: |
          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
          echo "::add-mask::${DOCKERHUB_AUTH}"

          cat <<-EOF > /kaniko/.docker/config.json
          {
            "auths": {
              "https://index.docker.io/v1/": {
                "auth": "${DOCKERHUB_AUTH}"
              }
            },
            "credHelpers": {
              "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
            }
          }
          EOF

      - name: Kaniko build neon
        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
        run:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
          --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
          --context .
          --build-arg GIT_VERSION=${{ github.sha }}
          --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
          --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}

      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder

@@ -652,7 +685,7 @@ jobs:
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.7.0-debug
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
        shell: sh -eu {0}

@@ -661,18 +694,41 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v1 # v3 won't work with kaniko

      - name: Configure ECR login
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
      - name: Configure ECR and Docker Hub login
        run: |
          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
          echo "::add-mask::${DOCKERHUB_AUTH}"

          cat <<-EOF > /kaniko/.docker/config.json
          {
            "auths": {
              "https://index.docker.io/v1/": {
                "auth": "${DOCKERHUB_AUTH}"
              }
            },
            "credHelpers": {
              "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
            }
          }
          EOF

      - name: Kaniko build compute tools
        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
        run:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
          --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
          --context .
          --build-arg GIT_VERSION=${{ github.sha }}
          --dockerfile Dockerfile.compute-tools
          --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
          --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  compute-node-image:
    runs-on: [ self-hosted, gen3, large ]
    container: gcr.io/kaniko-project/executor:v1.7.0-debug
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    needs: [ tag ]
    strategy:
      fail-fast: false

@@ -689,12 +745,36 @@ jobs:
          submodules: true
          fetch-depth: 0

      - name: Configure ECR login
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
      - name: Configure ECR and Docker Hub login
        run: |
          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
          echo "::add-mask::${DOCKERHUB_AUTH}"

          cat <<-EOF > /kaniko/.docker/config.json
          {
            "auths": {
              "https://index.docker.io/v1/": {
                "auth": "${DOCKERHUB_AUTH}"
              }
            },
            "credHelpers": {
              "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
            }
          }
          EOF

      - name: Kaniko build compute node with extensions
        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
        run:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
          --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
          --context .
          --build-arg GIT_VERSION=${{ github.sha }}
          --build-arg PG_VERSION=${{ matrix.version }}
          --dockerfile Dockerfile.compute-node
          --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
          --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

@@ -786,13 +866,11 @@ jobs:
    runs-on: [ self-hosted, gen3, small ]
    needs: [ tag, test-images, vm-compute-node-image ]
    container: golang:1.19-bullseye
    if: github.event_name != 'workflow_dispatch'
    # Don't add if-condition here.
    # The job should always be run because we have dependant other jobs that shouldn't be skipped

    steps:
      - name: Install Crane & ECR helper
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
        run: |
          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0

@@ -802,10 +880,15 @@ jobs:
          mkdir /github/home/.docker/
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

      - name: Copy vm-compute-node images to Docker Hub
        run: |
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15

      - name: Add latest tag to images
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
          github.event_name != 'workflow_dispatch'
        run: |
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest

@@ -814,50 +897,10 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  push-docker-hub:
    runs-on: [ self-hosted, dev, x64 ]
    needs: [ promote-images, tag ]
    container: golang:1.19-bullseye

    steps:
      - name: Install Crane & ECR helper
        run: |
          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0

      - name: Configure ECR login
        run: |
          mkdir /github/home/.docker/
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

      - name: Pull neon image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon

      - name: Pull compute tools image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools

      - name: Pull compute node v14 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

      - name: Pull vm compute node v14 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14

      - name: Pull compute node v15 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15

      - name: Pull vm compute node v15 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15

      - name: Pull rust image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust

      - name: Push images to production ECR
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
          github.event_name != 'workflow_dispatch'
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest

@@ -872,28 +915,12 @@ jobs:
          echo "" > /github/home/.docker/config.json
          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io

      - name: Push neon image to Docker Hub
        run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}}
      - name: Push vm-compute-node to Docker Hub
        run: |
          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}

      - name: Push compute tools image to Docker Hub
        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

      - name: Push compute node v14 image to Docker Hub
        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

      - name: Push vm compute node v14 image to Docker Hub
        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}

      - name: Push compute node v15 image to Docker Hub
        run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}

      - name: Push vm compute node v15 image to Docker Hub
        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}

      - name: Push rust image to Docker Hub
        run: crane push rust neondatabase/rust:pinned

      - name: Add latest tag to images in Docker Hub
      - name: Push latest tags to Docker Hub
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'

@@ -913,7 +940,7 @@ jobs:
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
    needs: [ push-docker-hub, tag, regress-tests ]
    needs: [ promote-images, tag, regress-tests ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
      github.event_name != 'workflow_dispatch'

@@ -947,7 +974,7 @@ jobs:
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    needs: [ push-docker-hub, tag, regress-tests ]
    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership

@@ -984,7 +1011,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    needs: [ push-docker-hub, tag, regress-tests ]
    needs: [ promote-images, tag, regress-tests ]
    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
    steps:
      - name: Promote compatibility snapshot for the release
52 .github/workflows/deploy-dev.yml (vendored)

@@ -48,7 +48,8 @@ jobs:
        shell: bash
    strategy:
      matrix:
        target_region: [ eu-west-1, us-east-2 ]
        # TODO(sergey): Fix storage deploy in eu-central-1
        target_region: [ eu-west-1, us-east-2]
    environment:
      name: dev-${{ matrix.target_region }}
    steps:

@@ -133,6 +134,53 @@ jobs:

      - name: Cleanup helm folder
        run: rm -rf ~/.cache

  deploy-preview-proxy-new:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    if: inputs.deployProxy
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: eu-central-1
            target_cluster: dev-eu-central-1-alpha
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1-node16
        with:
          role-to-assume: arn:aws:iam::369495373322:role/github-runner
          aws-region: eu-central-1
          role-skip-session-tagging: true
          role-duration-seconds: 1800

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Re-deploy preview proxies
        run: |
          DOCKER_TAG=${{ inputs.dockerTag }}
          for PREVIEW_NAME in helium argon krypton xenon radon oganesson hydrogen nitrogen oxygen fluorine chlorine; do
            export PREVIEW_NAME
            envsubst <.github/helm-values/preview-template.neon-proxy-scram.yaml >preview-${PREVIEW_NAME}.neon-proxy-scram.yaml
            helm upgrade neon-proxy-scram-${PREVIEW_NAME} neondatabase/neon-proxy --namespace neon-proxy-${PREVIEW_NAME} --create-namespace --install --atomic -f preview-${PREVIEW_NAME}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
          done

      - name: Cleanup helm folder
        run: rm -rf ~/.cache

  deploy-storage-broker-new:
    runs-on: [ self-hosted, gen3, small ]

@@ -148,6 +196,8 @@ jobs:
            target_cluster: dev-us-east-2-beta
          - target_region: eu-west-1
            target_cluster: dev-eu-west-1-zeta
          - target_region: eu-central-1
            target_cluster: dev-eu-central-1-alpha
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
4 .neon_clippy_args (new file)

@@ -0,0 +1,4 @@
# * `-A unknown_lints` – do not warn about unknown lint suppressions
#   that people with newer toolchains might use
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
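A minimal local sketch of what the CI clippy steps above do with these shared arguments. It assumes `cargo hack` is installed (the install step is not shown here) and is an illustration rather than the repository's own `run_clippy.sh`:

```sh
# Load the shared clippy arguments, then lint every feature combination,
# mirroring the "Run cargo clippy (debug)" / "(release)" CI steps above.
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
```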
133 Cargo.lock (generated)

@@ -1574,6 +1574,21 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"

[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
 "foreign-types-shared",
]

[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"

[[package]]
name = "form_urlencoded"
version = "1.1.0"

@@ -2361,6 +2376,24 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

[[package]]
name = "native-tls"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
dependencies = [
 "lazy_static",
 "libc",
 "log",
 "openssl",
 "openssl-probe",
 "openssl-sys",
 "schannel",
 "security-framework",
 "security-framework-sys",
 "tempfile",
]

[[package]]
name = "nix"
version = "0.26.2"

@@ -2483,12 +2516,50 @@ version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"

[[package]]
name = "openssl"
version = "0.10.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
dependencies = [
 "bitflags",
 "cfg-if",
 "foreign-types",
 "libc",
 "once_cell",
 "openssl-macros",
 "openssl-sys",
]

[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
 "proc-macro2",
 "quote",
 "syn 2.0.15",
]

[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

[[package]]
name = "openssl-sys"
version = "0.9.87"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
dependencies = [
 "cc",
 "libc",
 "pkg-config",
 "vcpkg",
]

[[package]]
name = "opentelemetry"
version = "0.18.0"

@@ -2681,6 +2752,7 @@ dependencies = [
 "tenant_size_model",
 "thiserror",
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
 "tokio-tar",
 "tokio-util",

@@ -2815,6 +2887,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

[[package]]
name = "pkg-config"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"

[[package]]
name = "plotters"
version = "0.3.4"

@@ -2846,7 +2924,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
dependencies = [
 "bytes",
 "fallible-iterator",

@@ -2856,10 +2934,21 @@ dependencies = [
 "tokio-postgres",
]

[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
dependencies = [
 "native-tls",
 "tokio",
 "tokio-native-tls",
 "tokio-postgres",
]

[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
dependencies = [
 "base64 0.20.0",
 "byteorder",

@@ -2877,7 +2966,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
dependencies = [
 "bytes",
 "fallible-iterator",

@@ -2958,7 +3047,6 @@ dependencies = [
 "pin-project-lite",
 "postgres-protocol",
 "rand",
 "serde",
 "thiserror",
 "tokio",
 "tracing",

@@ -3109,10 +3197,12 @@ dependencies = [
 "itertools",
 "md5",
 "metrics",
 "native-tls",
 "once_cell",
 "opentelemetry",
 "parking_lot",
 "pin-project-lite",
 "postgres-native-tls",
 "postgres_backend",
 "pq_proto",
 "prometheus",

@@ -3567,6 +3657,7 @@ dependencies = [
 "const_format",
 "crc32c",
 "fs2",
 "futures",
 "git-version",
 "hex",
 "humantime",

@@ -3581,6 +3672,7 @@ dependencies = [
 "pq_proto",
 "regex",
 "remote_storage",
 "reqwest",
 "safekeeper_api",
 "serde",
 "serde_json",

@@ -4319,10 +4411,20 @@ dependencies = [
 "syn 2.0.15",
]

[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
 "native-tls",
 "tokio",
]

[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
dependencies = [
 "async-trait",
 "byteorder",

@@ -4629,6 +4731,16 @@ dependencies = [
 "valuable",
]

[[package]]
name = "tracing-error"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e"
dependencies = [
 "tracing",
 "tracing-subscriber",
]

[[package]]
name = "tracing-futures"
version = "0.2.5"

@@ -4854,6 +4966,7 @@ dependencies = [
 "bincode",
 "byteorder",
 "bytes",
 "chrono",
 "criterion",
 "futures",
 "heapless",

@@ -4865,6 +4978,7 @@ dependencies = [
 "nix",
 "once_cell",
 "pin-project-lite",
 "pq_proto",
 "rand",
 "regex",
 "routerify",

@@ -4879,6 +4993,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
 "url",
 "uuid",

@@ -4901,6 +5016,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"

[[package]]
name = "version_check"
version = "0.9.4"

@@ -5279,13 +5400,11 @@ name = "workspace_hack"
version = "0.1.0"
dependencies = [
 "anyhow",
 "byteorder",
 "bytes",
 "chrono",
 "clap 4.2.2",
 "clap_builder",
 "crossbeam-utils",
 "digest",
 "either",
 "fail",
 "futures",
13 Cargo.toml

@@ -62,6 +62,7 @@ jsonwebtoken = "8"
libc = "0.2"
md5 = "0.7.0"
memoffset = "0.8"
native-tls = "0.2"
nix = "0.26"
notify = "5.0.0"
num_cpus = "1.15"

@@ -110,6 +111,7 @@ toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.18.0"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = "2.2"

@@ -123,10 +125,11 @@ env_logger = "0.10"
log = "0.4"

## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

## Other git libraries

@@ -161,7 +164,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
[patch.crates-io]
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }

################# Binary contents sections
@@ -249,18 +249,63 @@ impl ComputeNode {
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip(self, compute_state))]
    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
        #[derive(Clone)]
        enum Replication {
            Primary,
            Static { lsn: Lsn },
            HotStandby,
        }

        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        let hot_replica = if let Some(option) = spec.cluster.settings.find_ref("hot_standby") {
            if let Some(value) = &option.value {
                anyhow::ensure!(option.vartype == "bool");
                matches!(value.as_str(), "on" | "yes" | "true")
            } else {
                false
            }
        } else {
            false
        };

        let replication = if hot_replica {
            Replication::HotStandby
        } else if let Some(lsn) = spec.cluster.settings.find("recovery_target_lsn") {
            Replication::Static {
                lsn: Lsn::from_str(&lsn)?,
            }
        } else {
            Replication::Primary
        };

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;

        info!("starting safekeepers syncing");
        let lsn = self
            .sync_safekeepers(pspec.storage_auth_token.clone())
            .with_context(|| "failed to sync safekeepers")?;
        info!("safekeepers synced at LSN {}", lsn);
        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
        // cannot sync safekeepers.
        let lsn = match &replication {
            Replication::Primary => {
                info!("starting safekeepers syncing");
                let lsn = self
                    .sync_safekeepers(pspec.storage_auth_token.clone())
                    .with_context(|| "failed to sync safekeepers")?;
                info!("safekeepers synced at LSN {}", lsn);
                lsn
            }
            Replication::Static { lsn } => {
                info!("Starting read-only node at static LSN {}", lsn);
                *lsn
            }
            Replication::HotStandby => {
                info!("Initializing standby from latest Pageserver LSN");
                Lsn(0)
            }
        };

        info!(
            "getting basebackup@{} from pageserver {}",

@@ -276,6 +321,13 @@ impl ComputeNode {
        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;

        match &replication {
            Replication::Primary | Replication::Static { .. } => {}
            Replication::HotStandby => {
                add_standby_signal(pgdata_path)?;
            }
        }

        Ok(())
    }
@@ -94,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions {

pub trait GenericOptionsSearch {
    fn find(&self, name: &str) -> Option<String>;
    fn find_ref(&self, name: &str) -> Option<&GenericOption>;
}

impl GenericOptionsSearch for GenericOptions {

@@ -103,6 +104,12 @@ impl GenericOptionsSearch for GenericOptions {
        let op = ops.iter().find(|s| s.name == name)?;
        op.value.clone()
    }

    /// Lookup option by name, returning ref
    fn find_ref(&self, name: &str) -> Option<&GenericOption> {
        let ops = self.as_ref()?;
        ops.iter().find(|s| s.name == name)
    }
}

pub trait RoleExt {
@@ -1,3 +1,4 @@
use std::fs::File;
use std::path::Path;
use std::str::FromStr;

@@ -145,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    Ok(())
}

/// Create a standby.signal file
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
    // XXX: consider making it a part of spec.json
    info!("adding standby.signal");
    let signalfile = pgdata_path.join("standby.signal");

    if !signalfile.exists() {
        info!("created standby.signal");
        File::create(signalfile)?;
    } else {
        info!("reused pre-existing standby.signal");
    }
    Ok(())
}

/// Given a cluster spec json and open transaction it handles roles creation,
/// deletion and update.
#[instrument(skip_all)]
@@ -8,6 +8,7 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::endpoint::Replication;
use control_plane::local_env::LocalEnv;
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;

@@ -474,7 +475,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;

            println!("Creating endpoint for imported timeline ...");
            cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?;
            cplane.new_endpoint(
                tenant_id,
                name,
                timeline_id,
                None,
                pg_version,
                Replication::Primary,
            )?;
            println!("Done");
        }
        Some(("branch", branch_match)) => {

@@ -560,20 +568,20 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .iter()
                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
            {
                let lsn_str = match endpoint.lsn {
                    None => {
                        // -> primary endpoint
                let lsn_str = match endpoint.replication {
                    Replication::Static(lsn) => {
                        // -> read-only endpoint
                        // Use the node's LSN.
                        lsn.to_string()
                    }
                    _ => {
                        // -> primary endpoint or hot replica
                        // Use the LSN at the end of the timeline.
                        timeline_infos
                            .get(&endpoint.timeline_id)
                            .map(|bi| bi.last_record_lsn.to_string())
                            .unwrap_or_else(|| "?".to_string())
                    }
                    Some(lsn) => {
                        // -> read-only endpoint
                        // Use the endpoint's LSN.
                        lsn.to_string()
                    }
                };

                let branch_name = timeline_name_mappings

@@ -619,7 +627,26 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?;
            let hot_standby = sub_args
                .get_one::<bool>("hot-standby")
                .copied()
                .unwrap_or(false);

            let replication = match (lsn, hot_standby) {
                (Some(lsn), false) => Replication::Static(lsn),
                (None, true) => Replication::Replica,
                (None, false) => Replication::Primary,
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };

            cplane.new_endpoint(
                tenant_id,
                &endpoint_id,
                timeline_id,
                port,
                pg_version,
                replication,
            )?;
        }
        "start" => {
            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();

@@ -637,7 +664,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                None
            };

            let hot_standby = sub_args
                .get_one::<bool>("hot-standby")
                .copied()
                .unwrap_or(false);

            if let Some(endpoint) = endpoint {
                match (&endpoint.replication, hot_standby) {
                    (Replication::Static(_), true) => {
                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
                    }
                    (Replication::Primary, true) => {
                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
                    }
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
                endpoint.start(&auth_token)?;
            } else {

@@ -659,6 +700,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    .get_one::<u32>("pg-version")
                    .copied()
                    .context("Failed to `pg-version` from the argument string")?;

                let replication = match (lsn, hot_standby) {
                    (Some(lsn), false) => Replication::Static(lsn),
                    (None, true) => Replication::Replica,
                    (None, false) => Replication::Primary,
                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
                };

                // when used with custom port this results in non obvious behaviour
                // port is remembered from first start command, i e
                // start --port X

@@ -670,9 +719,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    tenant_id,
                    endpoint_id,
                    timeline_id,
                    lsn,
                    port,
                    pg_version,
                    replication,
                )?;
                ep.start(&auth_token)?;
            }

@@ -928,6 +977,12 @@ fn cli() -> Command {
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
        .required(false);

    let hot_standby_arg = Arg::new("hot-standby")
        .value_parser(value_parser!(bool))
        .long("hot-standby")
        .help("If set, the node will be a hot replica on the specified timeline")
        .required(false);

    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)

@@ -1052,6 +1107,7 @@ fn cli() -> Command {
                .long("config-only")
                .required(false))
            .arg(pg_version_arg.clone())
            .arg(hot_standby_arg.clone())
        )
        .subcommand(Command::new("start")
            .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")

@@ -1062,6 +1118,7 @@ fn cli() -> Command {
            .arg(lsn_arg)
            .arg(port_arg)
            .arg(pg_version_arg)
            .arg(hot_standby_arg)
        )
        .subcommand(
            Command::new("stop")
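A hedged usage sketch of the flags wired up above. The `--lsn` and `--hot-standby` flags and the error message come from this diff; the `cargo neon` wrapper invocation, the endpoint names, and the example LSN are assumptions for illustration only:

```sh
# Read-only endpoint pinned to a specific LSN (maps to Replication::Static).
cargo neon endpoint create ep-static --lsn 0/169C3A8

# Hot standby replica following the primary (maps to Replication::Replica).
cargo neon endpoint create ep-replica --hot-standby true

# Combining both flags is rejected: "cannot specify both lsn and hot-standby".
```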
@@ -68,18 +68,19 @@ impl ComputeControlPlane {
|
||||
tenant_id: TenantId,
|
||||
name: &str,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
port: Option<u16>,
|
||||
pg_version: u32,
|
||||
replication: Replication,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let port = port.unwrap_or_else(|| self.get_port());
|
||||
|
||||
let ep = Arc::new(Endpoint {
|
||||
name: name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
timeline_id,
|
||||
lsn,
|
||||
replication,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
});
|
||||
@@ -95,6 +96,18 @@ impl ComputeControlPlane {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
pub enum Replication {
|
||||
// Regular read-write node
|
||||
Primary,
|
||||
// if recovery_target_lsn is provided, and we want to pin the node to a specific LSN
|
||||
Static(Lsn),
|
||||
// Hot standby; read-only replica.
|
||||
// Future versions may want to distinguish between replicas with hot standby
|
||||
// feedback and other kinds of replication configurations.
|
||||
Replica,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Endpoint {
|
||||
/// used as the directory name
|
||||
@@ -102,7 +115,7 @@ pub struct Endpoint {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
// Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary.
|
||||
pub lsn: Option<Lsn>,
|
||||
pub replication: Replication,
|
||||
|
||||
// port and address of the Postgres server
|
||||
pub address: SocketAddr,
|
||||
@@ -153,9 +166,17 @@ impl Endpoint {
|
||||
fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string());
|
||||
let pg_version = u32::from_str(&pg_version_str)?;
|
||||
|
||||
// parse recovery_target_lsn, if any
|
||||
let recovery_target_lsn: Option<Lsn> =
|
||||
conf.parse_field_optional("recovery_target_lsn", &context)?;
|
||||
// parse recovery_target_lsn and primary_conninfo into Recovery Target, if any
|
||||
let replication = if let Some(lsn_str) = conf.get("recovery_target_lsn") {
|
||||
Replication::Static(Lsn::from_str(lsn_str)?)
|
||||
} else if let Some(slot_name) = conf.get("primary_slot_name") {
|
||||
let slot_name = slot_name.to_string();
|
||||
let prefix = format!("repl_{}_", timeline_id);
|
||||
assert!(slot_name.starts_with(&prefix));
|
||||
Replication::Replica
|
||||
} else {
|
||||
Replication::Primary
|
||||
};
|
||||
|
||||
// ok now
|
||||
Ok(Endpoint {
|
||||
@@ -164,7 +185,7 @@ impl Endpoint {
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
timeline_id,
|
||||
lsn: recovery_target_lsn,
|
||||
replication,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
})
|
||||
@@ -299,50 +320,83 @@ impl Endpoint {
|
||||
conf.append("neon.pageserver_connstring", &pageserver_connstr);
|
||||
conf.append("neon.tenant_id", &self.tenant_id.to_string());
|
||||
conf.append("neon.timeline_id", &self.timeline_id.to_string());
|
||||
if let Some(lsn) = self.lsn {
|
||||
conf.append("recovery_target_lsn", &lsn.to_string());
|
||||
}
|
||||
|
||||
conf.append_line("");
|
||||
// Configure backpressure
|
||||
// - Replication write lag depends on how fast the walreceiver can process incoming WAL.
|
||||
// This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
|
||||
// so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
|
||||
// Actually latency should be much smaller (better if < 1sec). But we assume that recently
|
||||
// updates pages are not requested from pageserver.
|
||||
// - Replication flush lag depends on speed of persisting data by checkpointer (creation of
|
||||
// delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
|
||||
// remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
|
||||
// recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
|
||||
// - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
|
||||
// To be able to restore database in case of pageserver node crash, safekeeper should not
|
||||
// remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
|
||||
// (if they are not able to upload WAL to S3).
|
||||
conf.append("max_replication_write_lag", "15MB");
|
||||
conf.append("max_replication_flush_lag", "10GB");
|
||||
// Replication-related configurations, such as WAL sending
|
||||
match &self.replication {
|
||||
Replication::Primary => {
|
||||
// Configure backpressure
|
||||
// - Replication write lag depends on how fast the walreceiver can process incoming WAL.
|
||||
// This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
|
||||
// so to avoid expiration of the 1 minute timeout, this lag should not be larger than 600MB.
|
||||
// Actually latency should be much smaller (better if < 1sec). But we assume that recently
|
||||
// updated pages are not requested from the pageserver.
|
||||
// - Replication flush lag depends on speed of persisting data by checkpointer (creation of
|
||||
// delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
|
||||
// remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
|
||||
// recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
|
||||
// - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
|
||||
// To be able to restore database in case of pageserver node crash, safekeeper should not
|
||||
// remove WAL beyond this point. Too large a lag can cause space exhaustion in safekeepers
|
||||
// (if they are not able to upload WAL to S3).
|
||||
conf.append("max_replication_write_lag", "15MB");
|
||||
conf.append("max_replication_flush_lag", "10GB");
|
||||
|
||||
if !self.env.safekeepers.is_empty() {
|
||||
// Configure Postgres to connect to the safekeepers
|
||||
conf.append("synchronous_standby_names", "walproposer");
|
||||
if !self.env.safekeepers.is_empty() {
|
||||
// Configure Postgres to connect to the safekeepers
|
||||
conf.append("synchronous_standby_names", "walproposer");
|
||||
|
||||
let safekeepers = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("neon.safekeepers", &safekeepers);
|
||||
} else {
|
||||
// We only use setup without safekeepers for tests,
|
||||
// and don't care about data durability on pageserver,
|
||||
// so set more relaxed synchronous_commit.
|
||||
conf.append("synchronous_commit", "remote_write");
|
||||
let safekeepers = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("neon.safekeepers", &safekeepers);
|
||||
} else {
|
||||
// We only use setup without safekeepers for tests,
|
||||
// and don't care about data durability on pageserver,
|
||||
// so set more relaxed synchronous_commit.
|
||||
conf.append("synchronous_commit", "remote_write");
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
// This isn't really a supported configuration, but can be useful for
|
||||
// testing.
|
||||
conf.append("synchronous_standby_names", "pageserver");
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
// This isn't really a supported configuration, but can be useful for
|
||||
// testing.
|
||||
conf.append("synchronous_standby_names", "pageserver");
|
||||
}
|
||||
}
|
||||
Replication::Static(lsn) => {
|
||||
conf.append("recovery_target_lsn", &lsn.to_string());
|
||||
}
|
||||
Replication::Replica => {
|
||||
assert!(!self.env.safekeepers.is_empty());
|
||||
|
||||
// TODO: use future host field from safekeeper spec
|
||||
// Pass the list of safekeepers to the replica so that it can connect to any of them,
|
||||
// whichever is available.
|
||||
let sk_ports = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|x| x.pg_port.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
|
||||
|
||||
let connstr = format!(
|
||||
"host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true",
|
||||
sk_hosts,
|
||||
sk_ports,
|
||||
&self.timeline_id.to_string(),
|
||||
&self.tenant_id.to_string(),
|
||||
);
|
||||
|
||||
let slot_name = format!("repl_{}_", self.timeline_id);
|
||||
conf.append("primary_conninfo", connstr.as_str());
|
||||
conf.append("primary_slot_name", slot_name.as_str());
|
||||
conf.append("hot_standby", "on");
|
||||
}
|
||||
}
|
||||
|
||||
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
|
||||
@@ -355,21 +409,27 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
let backup_lsn = if let Some(lsn) = self.lsn {
|
||||
Some(lsn)
|
||||
} else if !self.env.safekeepers.is_empty() {
|
||||
// LSN 0 means that it is a bootstrap and we only need to download the
|
||||
// latest data from the pageserver. That is a bit clumsy, but the whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things are more stable (TODO).
|
||||
let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
Some(lsn)
|
||||
let backup_lsn = match &self.replication {
|
||||
Replication::Primary => {
|
||||
if !self.env.safekeepers.is_empty() {
|
||||
// LSN 0 means that it is a bootstrap and we only need to download the
|
||||
// latest data from the pageserver. That is a bit clumsy, but the whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things are more stable (TODO).
|
||||
let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
Some(lsn)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Replication::Static(lsn) => Some(*lsn),
|
||||
Replication::Replica => {
|
||||
None // Take the latest snapshot available to start with
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.do_basebackup(backup_lsn)?;
|
||||
@@ -466,7 +526,7 @@ impl Endpoint {
|
||||
// 3. Load basebackup
|
||||
self.load_basebackup(auth_token)?;
|
||||
|
||||
if self.lsn.is_some() {
|
||||
if self.replication != Replication::Primary {
|
||||
File::create(self.pgdata().join("standby.signal"))?;
|
||||
}
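
For orientation, a minimal self-contained sketch of the decision the Endpoint changes above make: the replication mode is inferred from values already stored in the endpoint's postgresql.conf (recovery_target_lsn wins, a repl_<timeline_id>_ slot marks a replica, otherwise the endpoint is the primary). The Mode enum, the detect_mode name, and the string-typed LSN are illustrative stand-ins for the real Replication and Lsn types, not code from this branch.

use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum Mode {
    Primary,
    Static(String), // stand-in for Replication::Static(Lsn)
    Replica,
}

fn detect_mode(conf: &HashMap<&str, &str>, timeline_id: &str) -> Mode {
    if let Some(lsn) = conf.get("recovery_target_lsn") {
        Mode::Static(lsn.to_string())
    } else if let Some(slot) = conf.get("primary_slot_name") {
        // Replicas are expected to use a slot named "repl_<timeline_id>_".
        assert!(slot.starts_with(&format!("repl_{}_", timeline_id)));
        Mode::Replica
    } else {
        Mode::Primary
    }
}

fn main() {
    let mut conf: HashMap<&str, &str> = HashMap::new();
    assert_eq!(detect_mode(&conf, "tl1"), Mode::Primary);
    conf.insert("primary_slot_name", "repl_tl1_");
    assert_eq!(detect_mode(&conf, "tl1"), Mode::Replica);
    conf.insert("recovery_target_lsn", "0/169D9D8");
    assert!(matches!(detect_mode(&conf, "tl1"), Mode::Static(_)));
}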
|
||||
|
||||
|
||||
@@ -359,8 +359,8 @@ impl PageServerNode {
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.get("eviction_policy")
|
||||
.map(|x| serde_json::from_str(x))
|
||||
.remove("eviction_policy")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'eviction_policy' json")?,
|
||||
min_resident_size_override: settings
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::io::BufRead;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// In-memory representation of a postgresql.conf file
|
||||
#[derive(Default)]
|
||||
#[derive(Default, Debug)]
|
||||
pub struct PostgresConf {
|
||||
lines: Vec<String>,
|
||||
hash: HashMap<String, String>,
|
||||
|
||||
@@ -28,11 +28,6 @@
|
||||
"value": "replica",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "hot_standby",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_log_hints",
|
||||
"value": "on",
|
||||
|
||||
@@ -95,10 +95,13 @@ pub fn generate_wal_segment(
|
||||
segno: u64,
|
||||
system_id: u64,
|
||||
pg_version: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<Bytes, SerializeError> {
|
||||
assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));
|
||||
|
||||
match pg_version {
|
||||
14 => v14::xlog_utils::generate_wal_segment(segno, system_id),
|
||||
15 => v15::xlog_utils::generate_wal_segment(segno, system_id),
|
||||
14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
_ => Err(SerializeError::BadInput),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,6 +195,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
|
||||
|
||||
pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
|
||||
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
|
||||
/* From fsm_internals.h */
|
||||
|
||||
@@ -270,6 +270,11 @@ impl XLogPageHeaderData {
|
||||
use utils::bin_ser::LeSer;
|
||||
XLogPageHeaderData::des_from(&mut buf.reader())
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
use utils::bin_ser::LeSer;
|
||||
self.ser().map(|b| b.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl XLogLongPageHeaderData {
|
||||
@@ -328,22 +333,32 @@ impl CheckPoint {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Generate new, empty WAL segment.
|
||||
// We need this segment to start compute node.
|
||||
//
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
|
||||
/// Generate new, empty WAL segment, with correct block headers at the first
|
||||
/// page of the segment and the page that contains the given LSN.
|
||||
/// We need this segment to start compute node.
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Bytes, SerializeError> {
|
||||
let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);
|
||||
|
||||
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
|
||||
|
||||
let page_off = lsn.block_offset();
|
||||
let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
|
||||
let first_page_only = seg_off < XLOG_BLCKSZ;
|
||||
let (shdr_rem_len, infoflags) = if first_page_only {
|
||||
(seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
|
||||
} else {
|
||||
(0, 0)
|
||||
};
|
||||
|
||||
let hdr = XLogLongPageHeaderData {
|
||||
std: {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER | infoflags,
|
||||
xlp_tli: PG_TLI,
|
||||
xlp_pageaddr: pageaddr,
|
||||
xlp_rem_len: 0,
|
||||
xlp_rem_len: shdr_rem_len as u32,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
}
|
||||
},
|
||||
@@ -357,6 +372,33 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
|
||||
|
||||
//zero out the rest of the file
|
||||
seg_buf.resize(WAL_SEGMENT_SIZE, 0);
|
||||
|
||||
if !first_page_only {
|
||||
let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
|
||||
let header = XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
|
||||
pg_constants::XLP_FIRST_IS_CONTRECORD
|
||||
} else {
|
||||
0
|
||||
},
|
||||
xlp_tli: PG_TLI,
|
||||
xlp_pageaddr: lsn.page_lsn().0,
|
||||
xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
|
||||
page_off as u32
|
||||
} else {
|
||||
0u32
|
||||
},
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
};
|
||||
let hdr_bytes = header.encode()?;
|
||||
|
||||
debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len());
|
||||
debug_assert_ne!(block_offset, 0);
|
||||
|
||||
seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]);
|
||||
}
|
||||
|
||||
Ok(seg_buf.freeze())
|
||||
}
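
For orientation, a standalone sketch of the offset arithmetic behind the first_page_only decision and the placement of the mid-segment page header above, assuming the stock 16 MiB WAL segment and 8 KiB page size; the constants and the sample LSN are illustrative, not taken from this branch.

const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024; // assumed default segment size
const XLOG_BLCKSZ: u64 = 8192; // assumed default WAL page size

fn main() {
    let lsn: u64 = 0x0169_9D70; // an arbitrary LSN somewhere inside a segment

    let seg_off = lsn % WAL_SEGMENT_SIZE; // what Lsn::segment_offset computes
    let block_off = lsn % XLOG_BLCKSZ; // what Lsn::block_offset computes
    let page_off_in_seg = seg_off - block_off; // what Lsn::page_offset_in_segment computes

    // Only the long header on the first page is needed when the LSN still falls
    // on that first page; otherwise a short header is also written at the start
    // of the page containing the LSN, carrying the contrecord flag and remaining
    // length when a record straddles the page boundary.
    let first_page_only = seg_off < XLOG_BLCKSZ;

    println!(
        "seg_off={seg_off} block_off={block_off} \
         page_off_in_seg={page_off_in_seg} first_page_only={first_page_only}"
    );
}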
|
||||
|
||||
|
||||
@@ -6,9 +6,8 @@ use postgres::Client;
|
||||
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||
use std::cmp::Ordering;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::process::Command;
|
||||
use std::time::Instant;
|
||||
use tempfile::{tempdir, TempDir};
|
||||
|
||||
@@ -95,12 +94,6 @@ impl Conf {
|
||||
|
||||
pub fn start_server(&self) -> Result<PostgresServer> {
|
||||
info!("Starting Postgres server in {:?}", self.datadir);
|
||||
let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create pg.log file in directory {}",
|
||||
self.datadir.display()
|
||||
)
|
||||
})?;
|
||||
let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
|
||||
let unix_socket_dir_path = unix_socket_dir.path().to_owned();
|
||||
let server_process = self
|
||||
@@ -110,9 +103,7 @@ impl Conf {
|
||||
.arg(unix_socket_dir_path.as_os_str())
|
||||
.arg("-D")
|
||||
.arg(self.datadir.as_os_str())
|
||||
.args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
|
||||
.args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
|
||||
.stderr(Stdio::from(log_file))
|
||||
.spawn()?;
|
||||
let server = PostgresServer {
|
||||
process: server_process,
|
||||
@@ -121,7 +112,7 @@ impl Conf {
|
||||
let mut c = postgres::Config::new();
|
||||
c.host_path(&unix_socket_dir_path);
|
||||
c.user("postgres");
|
||||
c.connect_timeout(Duration::from_millis(1000));
|
||||
c.connect_timeout(Duration::from_millis(10000));
|
||||
c
|
||||
},
|
||||
};
|
||||
|
||||
@@ -10,7 +10,6 @@ byteorder.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -6,15 +6,10 @@ pub mod framed;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
fmt, io, str,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
use tracing::{trace, warn};
|
||||
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
||||
|
||||
// re-export for use in utils pageserver_feedback.rs
|
||||
pub use postgres_protocol::PG_EPOCH;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type SystemId = u64;
|
||||
@@ -664,7 +659,7 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolErr
|
||||
}
|
||||
|
||||
/// Read cstring from buf, advancing it.
|
||||
fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
|
||||
pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
|
||||
let pos = buf
|
||||
.iter()
|
||||
.position(|x| *x == 0)
|
||||
@@ -939,175 +934,10 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
||||
/// Serialized in custom flexible key/value format. In replication protocol, it
|
||||
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
||||
/// Standby status update / Hot standby feedback messages.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct PageserverFeedback {
|
||||
/// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub current_timeline_size: u64,
|
||||
/// LSN last received and ingested by the pageserver.
|
||||
pub last_received_lsn: u64,
|
||||
/// LSN up to which data is persisted by the pageserver to its local disc.
|
||||
pub disk_consistent_lsn: u64,
|
||||
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
|
||||
/// consider WAL before it can be removed.
|
||||
pub remote_consistent_lsn: u64,
|
||||
pub replytime: SystemTime,
|
||||
}
|
||||
|
||||
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
||||
// Do not remove previously available fields because this might be backwards incompatible.
|
||||
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
|
||||
impl PageserverFeedback {
|
||||
pub fn empty() -> PageserverFeedback {
|
||||
PageserverFeedback {
|
||||
current_timeline_size: 0,
|
||||
last_received_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
disk_consistent_lsn: 0,
|
||||
replytime: SystemTime::now(),
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize PageserverFeedback using custom format
|
||||
// to support protocol extensibility.
|
||||
//
|
||||
// Following layout is used:
|
||||
// char - number of key-value pairs that follow.
|
||||
//
|
||||
// key-value pairs:
|
||||
// null-terminated string - key,
|
||||
// uint32 - value length in bytes
|
||||
// value itself
|
||||
//
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.current_timeline_size);
|
||||
|
||||
buf.put_slice(b"ps_writelsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.last_received_lsn);
|
||||
buf.put_slice(b"ps_flushlsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.disk_consistent_lsn);
|
||||
buf.put_slice(b"ps_applylsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.remote_consistent_lsn);
|
||||
|
||||
let timestamp = self
|
||||
.replytime
|
||||
.duration_since(*PG_EPOCH)
|
||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||
.as_micros() as i64;
|
||||
|
||||
buf.put_slice(b"ps_replytime\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_i64(timestamp);
|
||||
}
|
||||
|
||||
// Deserialize PageserverFeedback message
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn parse(mut buf: Bytes) -> PageserverFeedback {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
let nfields = buf.get_u8();
|
||||
for _ in 0..nfields {
|
||||
let key = read_cstr(&mut buf).unwrap();
|
||||
match key.as_ref() {
|
||||
b"current_timeline_size" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.current_timeline_size = buf.get_u64();
|
||||
}
|
||||
b"ps_writelsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.last_received_lsn = buf.get_u64();
|
||||
}
|
||||
b"ps_flushlsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.disk_consistent_lsn = buf.get_u64();
|
||||
}
|
||||
b"ps_applylsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.remote_consistent_lsn = buf.get_u64();
|
||||
}
|
||||
b"ps_replytime" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
let raw_time = buf.get_i64();
|
||||
if raw_time > 0 {
|
||||
rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
} else {
|
||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
"PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
|
||||
String::from_utf8_lossy(key.as_ref())
|
||||
);
|
||||
buf.advance(len as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
trace!("PageserverFeedback parsed is {:?}", rf);
|
||||
rf
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_serialization() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_unknown_key() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
if let Some(first) = data.first_mut() {
|
||||
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
}
|
||||
|
||||
data.put_slice(b"new_field_one\0");
|
||||
data.put_i32(8);
|
||||
data.put_u64(42);
|
||||
|
||||
// Parse serialized data and check that new field is not parsed
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_startup_message_params_options_escaped() {
|
||||
fn split_options(params: &StartupMessageParams) -> Vec<Cow<'_, str>> {
|
||||
|
||||
@@ -99,7 +99,11 @@ struct S3WithTestBlobs {
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledS3 {
|
||||
async fn setup() -> Self {
|
||||
utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed");
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
|
||||
@@ -11,6 +11,7 @@ async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
@@ -27,7 +28,8 @@ signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
rand.workspace = true
|
||||
serde_with.workspace = true
|
||||
strum.workspace = true
|
||||
@@ -35,6 +37,7 @@ strum_macros.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
metrics.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -76,6 +76,7 @@ where
|
||||
|
||||
let log_quietly = method == Method::GET;
|
||||
async move {
|
||||
let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
|
||||
if log_quietly {
|
||||
debug!("Handling request");
|
||||
} else {
|
||||
@@ -87,7 +88,11 @@ where
|
||||
// Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
|
||||
//
|
||||
// Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
|
||||
match (self.0)(request).await {
|
||||
let res = (self.0)(request).await;
|
||||
|
||||
cancellation_guard.disarm();
|
||||
|
||||
match res {
|
||||
Ok(response) => {
|
||||
let response_status = response.status();
|
||||
if log_quietly && response_status.is_success() {
|
||||
@@ -105,6 +110,38 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop guard to WARN in case the request was dropped before completion.
|
||||
struct RequestCancelled {
|
||||
warn: Option<tracing::Span>,
|
||||
}
|
||||
|
||||
impl RequestCancelled {
|
||||
/// Create the drop guard using the [`tracing::Span::current`] as the span.
|
||||
fn warn_when_dropped_without_responding() -> Self {
|
||||
RequestCancelled {
|
||||
warn: Some(tracing::Span::current()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Consume the drop guard without logging anything.
|
||||
fn disarm(mut self) {
|
||||
self.warn = None;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RequestCancelled {
|
||||
fn drop(&mut self) {
|
||||
if let Some(span) = self.warn.take() {
|
||||
// the span has all of the info already, but the outer `.instrument(span)` has already
|
||||
// been dropped, so we need to manually re-enter it for this message.
|
||||
//
|
||||
// this is what the instrument would do before polling so it is fine.
|
||||
let _g = span.entered();
|
||||
warn!("request was dropped before completing");
|
||||
}
|
||||
}
|
||||
}
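
The RequestCancelled guard above follows the disarm-on-success drop-guard pattern; below is a dependency-free sketch of the same idea, with eprintln! standing in for the tracing warning and all names illustrative.

struct CancelGuard {
    armed: bool,
}

impl CancelGuard {
    fn new() -> Self {
        CancelGuard { armed: true }
    }
    // Consume the guard without logging anything; Drop still runs but is a no-op.
    fn disarm(mut self) {
        self.armed = false;
    }
}

impl Drop for CancelGuard {
    fn drop(&mut self) {
        if self.armed {
            eprintln!("request was dropped before completing");
        }
    }
}

fn main() {
    {
        let _dropped = CancelGuard::new(); // goes out of scope while armed: warns
    }
    let ok = CancelGuard::new();
    ok.disarm(); // consumed without a warning
}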
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Buf;
|
||||
use hyper::{header, Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::error::ApiError;
|
||||
|
||||
@@ -33,12 +31,3 @@ pub fn json_response<T: Serialize>(
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Serialize through Display trait.
|
||||
pub fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
F: Display,
|
||||
{
|
||||
s.serialize_str(&format!("{}", z))
|
||||
}
|
||||
|
||||
@@ -265,6 +265,26 @@ impl fmt::Display for TenantTimelineId {
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for TenantTimelineId {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut parts = s.split('/');
|
||||
let tenant_id = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain tenant_id"))?
|
||||
.parse()?;
|
||||
let timeline_id = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain timeline_id"))?
|
||||
.parse()?;
|
||||
if parts.next().is_some() {
|
||||
anyhow::bail!("TenantTimelineId must contain only tenant_id and timeline_id");
|
||||
}
|
||||
Ok(TenantTimelineId::new(tenant_id, timeline_id))
|
||||
}
|
||||
}
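
A standalone sketch of the "tenant_id/timeline_id" contract the FromStr impl above enforces: exactly two '/'-separated parts. The real code additionally parses each part into TenantId/TimelineId (assumed here to be hex strings); the function name and sample values below are illustrative.

fn split_tenant_timeline(s: &str) -> Result<(&str, &str), String> {
    let mut parts = s.split('/');
    let tenant_id = parts.next().ok_or("missing tenant_id")?;
    let timeline_id = parts.next().ok_or("missing timeline_id")?;
    if parts.next().is_some() {
        return Err("only tenant_id and timeline_id are allowed".to_string());
    }
    Ok((tenant_id, timeline_id))
}

fn main() {
    assert!(split_tenant_timeline("de200bd42b49cc1814412c7e592dd6e9/11223344556677881122334455667788").is_ok());
    assert!(split_tenant_timeline("a/b/c").is_err());
    assert!(split_tenant_timeline("only-one-part").is_err());
}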
|
||||
|
||||
// Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued
|
||||
// by the console.
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
|
||||
|
||||
@@ -54,6 +54,10 @@ pub mod measured_stream;
|
||||
pub mod serde_percent;
|
||||
pub mod serde_regex;
|
||||
|
||||
pub mod pageserver_feedback;
|
||||
|
||||
pub mod tracing_span_assert;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
#[macro_export]
|
||||
macro_rules! failpoint_sleep_millis_async {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use strum_macros::{EnumString, EnumVariantNames};
|
||||
|
||||
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
@@ -23,24 +24,81 @@ impl LogFormat {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
let default_filter_str = "info";
|
||||
static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
metrics::register_int_counter_vec!(
|
||||
"libmetrics_tracing_event_count",
|
||||
"Number of tracing events, by level",
|
||||
&["level"]
|
||||
)
|
||||
.expect("failed to define metric")
|
||||
});
|
||||
|
||||
struct TracingEventCountLayer(&'static metrics::IntCounterVec);
|
||||
|
||||
impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
|
||||
where
|
||||
S: tracing::Subscriber,
|
||||
{
|
||||
fn on_event(
|
||||
&self,
|
||||
event: &tracing::Event<'_>,
|
||||
_ctx: tracing_subscriber::layer::Context<'_, S>,
|
||||
) {
|
||||
let level = event.metadata().level();
|
||||
let level = match *level {
|
||||
tracing::Level::ERROR => "error",
|
||||
tracing::Level::WARN => "warn",
|
||||
tracing::Level::INFO => "info",
|
||||
tracing::Level::DEBUG => "debug",
|
||||
tracing::Level::TRACE => "trace",
|
||||
};
|
||||
self.0.with_label_values(&[level]).inc();
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether to add the `tracing_error` crate's `ErrorLayer`
|
||||
/// to the global tracing subscriber.
|
||||
///
|
||||
pub enum TracingErrorLayerEnablement {
|
||||
/// Do not add the `ErrorLayer`.
|
||||
Disabled,
|
||||
/// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
|
||||
EnableWithRustLogFilter,
|
||||
}
|
||||
|
||||
pub fn init(
|
||||
log_format: LogFormat,
|
||||
tracing_error_layer_enablement: TracingErrorLayerEnablement,
|
||||
) -> anyhow::Result<()> {
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
// the RUST_LOG environment variable is not set.
|
||||
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
|
||||
let rust_log_env_filter = || {
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
|
||||
};
|
||||
|
||||
let base_logger = tracing_subscriber::fmt()
|
||||
.with_env_filter(env_filter)
|
||||
.with_target(false)
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_writer(std::io::stdout);
|
||||
|
||||
match log_format {
|
||||
LogFormat::Json => base_logger.json().init(),
|
||||
LogFormat::Plain => base_logger.init(),
|
||||
LogFormat::Test => base_logger.with_test_writer().init(),
|
||||
// NB: the order of the with() calls does not matter.
|
||||
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
use tracing_subscriber::prelude::*;
|
||||
let r = tracing_subscriber::registry();
|
||||
let r = r.with({
|
||||
let log_layer = tracing_subscriber::fmt::layer()
|
||||
.with_target(false)
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_writer(std::io::stdout);
|
||||
let log_layer = match log_format {
|
||||
LogFormat::Json => log_layer.json().boxed(),
|
||||
LogFormat::Plain => log_layer.boxed(),
|
||||
LogFormat::Test => log_layer.with_test_writer().boxed(),
|
||||
};
|
||||
log_layer.with_filter(rust_log_env_filter())
|
||||
});
|
||||
let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
|
||||
match tracing_error_layer_enablement {
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter => r
|
||||
.with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
|
||||
.init(),
|
||||
TracingErrorLayerEnablement::Disabled => r.init(),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -157,3 +215,33 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
|
||||
<Self as std::fmt::Display>::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use metrics::{core::Opts, IntCounterVec};
|
||||
|
||||
use super::TracingEventCountLayer;
|
||||
|
||||
#[test]
|
||||
fn tracing_event_count_metric() {
|
||||
let counter_vec =
|
||||
IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
|
||||
let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
|
||||
let layer = TracingEventCountLayer(counter_vec);
|
||||
use tracing_subscriber::prelude::*;
|
||||
|
||||
tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
|
||||
tracing::trace!("foo");
|
||||
tracing::debug!("foo");
|
||||
tracing::info!("foo");
|
||||
tracing::warn!("foo");
|
||||
tracing::error!("foo");
|
||||
});
|
||||
|
||||
assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
|
||||
assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
|
||||
assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
|
||||
assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
|
||||
assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,29 +62,48 @@ impl Lsn {
|
||||
}
|
||||
|
||||
/// Compute the offset into a segment
|
||||
#[inline]
|
||||
pub fn segment_offset(self, seg_sz: usize) -> usize {
|
||||
(self.0 % seg_sz as u64) as usize
|
||||
}
|
||||
|
||||
/// Compute LSN of the segment start.
|
||||
#[inline]
|
||||
pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
|
||||
Lsn(self.0 - (self.0 % seg_sz as u64))
|
||||
}
|
||||
|
||||
/// Compute the segment number
|
||||
#[inline]
|
||||
pub fn segment_number(self, seg_sz: usize) -> u64 {
|
||||
self.0 / seg_sz as u64
|
||||
}
|
||||
|
||||
/// Compute the offset into a block
|
||||
#[inline]
|
||||
pub fn block_offset(self) -> u64 {
|
||||
const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
|
||||
self.0 % BLCKSZ
|
||||
}
|
||||
|
||||
/// Compute the LSN of the start of the page (block) that contains
|
||||
/// this LSN.
|
||||
#[inline]
|
||||
pub fn page_lsn(self) -> Lsn {
|
||||
Lsn(self.0 - self.block_offset())
|
||||
}
|
||||
|
||||
/// Compute the offset, within the segment, of the start of the page (block)
|
||||
/// that contains this LSN.
|
||||
#[inline]
|
||||
pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 {
|
||||
(self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0
|
||||
}
|
||||
|
||||
/// Compute the bytes remaining in this block
|
||||
///
|
||||
/// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`.
|
||||
#[inline]
|
||||
pub fn remaining_in_block(self) -> u64 {
|
||||
const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
|
||||
BLCKSZ - (self.0 % BLCKSZ)
|
||||
|
||||
214
libs/utils/src/pageserver_feedback.rs
Normal file
@@ -0,0 +1,214 @@
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use pq_proto::{read_cstr, PG_EPOCH};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tracing::{trace, warn};
|
||||
|
||||
use crate::lsn::Lsn;
|
||||
|
||||
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
||||
/// Serialized in custom flexible key/value format. In replication protocol, it
|
||||
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
||||
/// Standby status update / Hot standby feedback messages.
|
||||
///
|
||||
/// serde Serialize is used only for a human-readable dump to JSON (e.g. in
|
||||
/// safekeepers debug_dump).
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct PageserverFeedback {
|
||||
/// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub current_timeline_size: u64,
|
||||
/// LSN last received and ingested by the pageserver. Controls backpressure.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub last_received_lsn: Lsn,
|
||||
/// LSN up to which data is persisted by the pageserver to its local disc.
|
||||
/// Controls backpressure.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
|
||||
/// consider WAL before it can be removed.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
// Serialize with RFC3339 format.
|
||||
#[serde(with = "serde_systemtime")]
|
||||
pub replytime: SystemTime,
|
||||
}
|
||||
|
||||
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
||||
// Do not remove previously available fields because this might be backwards incompatible.
|
||||
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
|
||||
impl PageserverFeedback {
|
||||
pub fn empty() -> PageserverFeedback {
|
||||
PageserverFeedback {
|
||||
current_timeline_size: 0,
|
||||
last_received_lsn: Lsn::INVALID,
|
||||
remote_consistent_lsn: Lsn::INVALID,
|
||||
disk_consistent_lsn: Lsn::INVALID,
|
||||
replytime: *PG_EPOCH,
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize PageserverFeedback using custom format
|
||||
// to support protocol extensibility.
|
||||
//
|
||||
// Following layout is used:
|
||||
// char - number of key-value pairs that follow.
|
||||
//
|
||||
// key-value pairs:
|
||||
// null-terminated string - key,
|
||||
// uint32 - value length in bytes
|
||||
// value itself
|
||||
//
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.current_timeline_size);
|
||||
|
||||
buf.put_slice(b"ps_writelsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.last_received_lsn.0);
|
||||
buf.put_slice(b"ps_flushlsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.disk_consistent_lsn.0);
|
||||
buf.put_slice(b"ps_applylsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.remote_consistent_lsn.0);
|
||||
|
||||
let timestamp = self
|
||||
.replytime
|
||||
.duration_since(*PG_EPOCH)
|
||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||
.as_micros() as i64;
|
||||
|
||||
buf.put_slice(b"ps_replytime\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_i64(timestamp);
|
||||
}
|
||||
|
||||
// Deserialize PageserverFeedback message
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn parse(mut buf: Bytes) -> PageserverFeedback {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
let nfields = buf.get_u8();
|
||||
for _ in 0..nfields {
|
||||
let key = read_cstr(&mut buf).unwrap();
|
||||
match key.as_ref() {
|
||||
b"current_timeline_size" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.current_timeline_size = buf.get_u64();
|
||||
}
|
||||
b"ps_writelsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.last_received_lsn = Lsn(buf.get_u64());
|
||||
}
|
||||
b"ps_flushlsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.disk_consistent_lsn = Lsn(buf.get_u64());
|
||||
}
|
||||
b"ps_applylsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.remote_consistent_lsn = Lsn(buf.get_u64());
|
||||
}
|
||||
b"ps_replytime" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
let raw_time = buf.get_i64();
|
||||
if raw_time > 0 {
|
||||
rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
} else {
|
||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
"PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
|
||||
String::from_utf8_lossy(key.as_ref())
|
||||
);
|
||||
buf.advance(len as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
trace!("PageserverFeedback parsed is {:?}", rf);
|
||||
rf
|
||||
}
|
||||
}
|
||||
|
||||
mod serde_systemtime {
|
||||
use std::time::SystemTime;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Deserializer, Serializer};
|
||||
|
||||
pub fn serialize<S>(ts: &SystemTime, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let chrono_dt: DateTime<Utc> = (*ts).into();
|
||||
serializer.serialize_str(&chrono_dt.to_rfc3339())
|
||||
}
|
||||
|
||||
pub fn deserialize<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let time: String = Deserialize::deserialize(deserializer)?;
|
||||
Ok(DateTime::parse_from_rfc3339(&time)
|
||||
.map_err(serde::de::Error::custom)?
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_serialization() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_unknown_key() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
if let Some(first) = data.first_mut() {
|
||||
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
}
|
||||
|
||||
data.put_slice(b"new_field_one\0");
|
||||
data.put_i32(8);
|
||||
data.put_u64(42);
|
||||
|
||||
// Parse serialized data and check that new field is not parsed
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
}
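
As a cross-check of the key/value wire layout documented in serialize() above, a dependency-free sketch that writes one field and reads it back; the length is big-endian to match BytesMut::put_i32, the key is one of the real field names, and everything else is illustrative.

fn main() {
    // Encode: count byte, NUL-terminated key, 4-byte big-endian value length, raw value.
    let mut buf: Vec<u8> = Vec::new();
    buf.push(1); // number of key/value pairs
    buf.extend_from_slice(b"current_timeline_size\0");
    buf.extend_from_slice(&8i32.to_be_bytes());
    buf.extend_from_slice(&12_345_678u64.to_be_bytes());

    // Decode it back, the same way PageserverFeedback::parse walks the buffer.
    let nfields = buf[0];
    let mut pos = 1usize;
    let nul = buf[pos..].iter().position(|b| *b == 0).unwrap();
    let key = std::str::from_utf8(&buf[pos..pos + nul]).unwrap();
    pos += nul + 1;
    let len = i32::from_be_bytes(buf[pos..pos + 4].try_into().unwrap()) as usize;
    pos += 4;
    let value = u64::from_be_bytes(buf[pos..pos + len].try_into().unwrap());

    assert_eq!(nfields, 1);
    assert_eq!((key, value), ("current_timeline_size", 12_345_678));
}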
|
||||
287
libs/utils/src/tracing_span_assert.rs
Normal file
@@ -0,0 +1,287 @@
|
||||
//! Assert that the current [`tracing::Span`] has a given set of fields.
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! ```
|
||||
//! use tracing_subscriber::prelude::*;
|
||||
//! let registry = tracing_subscriber::registry()
|
||||
//! .with(tracing_error::ErrorLayer::default());
|
||||
//!
|
||||
//! // Register the registry as the global subscriber.
|
||||
//! // In this example, we'll only use it as a thread-local subscriber.
|
||||
//! let _guard = tracing::subscriber::set_default(registry);
|
||||
//!
|
||||
//! // Then, in the main code:
|
||||
//!
|
||||
//! let span = tracing::info_span!("TestSpan", test_id = 1);
|
||||
//! let _guard = span.enter();
|
||||
//!
|
||||
//! // ... down the call stack
|
||||
//!
|
||||
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
||||
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
|
||||
//! match check_fields_present([&extractor]) {
|
||||
//! Ok(()) => {},
|
||||
//! Err(missing) => {
|
||||
//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
|
||||
//! }
|
||||
//! }
|
||||
//! ```
|
||||
//!
|
||||
//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
//!
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::{self},
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
pub enum ExtractionResult {
|
||||
Present,
|
||||
Absent,
|
||||
}
|
||||
|
||||
pub trait Extractor: Send + Sync + std::fmt::Debug {
|
||||
fn name(&self) -> &str;
|
||||
fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MultiNameExtractor<const L: usize> {
|
||||
name: &'static str,
|
||||
field_names: [&'static str; L],
|
||||
}
|
||||
|
||||
impl<const L: usize> MultiNameExtractor<L> {
|
||||
pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
|
||||
MultiNameExtractor { name, field_names }
|
||||
}
|
||||
}
|
||||
impl<const L: usize> Extractor for MultiNameExtractor<L> {
|
||||
fn name(&self) -> &str {
|
||||
self.name
|
||||
}
|
||||
fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
|
||||
if fields.iter().any(|f| self.field_names.contains(&f.name())) {
|
||||
ExtractionResult::Present
|
||||
} else {
|
||||
ExtractionResult::Absent
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||
|
||||
impl<'a> MemoryIdentity<'a> {
|
||||
fn as_ptr(&self) -> *const () {
|
||||
self.0 as *const _ as *const ()
|
||||
}
|
||||
}
|
||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_ptr() == other.as_ptr()
|
||||
}
|
||||
}
|
||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||
impl<'a> Hash for MemoryIdentity<'a> {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.as_ptr().hash(state);
|
||||
}
|
||||
}
|
||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||
}
|
||||
}
|
||||
|
||||
/// Check that every extractor in `must_be_present` finds a matching field in some span
/// of the current span hierarchy; extractors whose fields were not found are returned in the error.
|
||||
pub fn check_fields_present<const L: usize>(
|
||||
must_be_present: [&dyn Extractor; L],
|
||||
) -> Result<(), Vec<&dyn Extractor>> {
|
||||
let mut missing: HashSet<MemoryIdentity> =
|
||||
HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
|
||||
let trace = tracing_error::SpanTrace::capture();
|
||||
trace.with_spans(|md, _formatted_fields| {
|
||||
missing.retain(|extractor| match extractor.0.extract(md.fields()) {
|
||||
ExtractionResult::Present => false,
|
||||
ExtractionResult::Absent => true,
|
||||
});
|
||||
!missing.is_empty() // continue walking up until we've found all missing
|
||||
});
|
||||
if missing.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(missing.into_iter().map(|mi| mi.0).collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use tracing_subscriber::prelude::*;
|
||||
|
||||
use super::*;
|
||||
|
||||
struct Setup {
|
||||
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
|
||||
tenant_extractor: MultiNameExtractor<2>,
|
||||
timeline_extractor: MultiNameExtractor<2>,
|
||||
}
|
||||
|
||||
fn setup_current_thread() -> Setup {
|
||||
let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
|
||||
let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);
|
||||
|
||||
let registry = tracing_subscriber::registry()
|
||||
.with(tracing_subscriber::fmt::layer())
|
||||
.with(tracing_error::ErrorLayer::default());
|
||||
|
||||
let guard = tracing::subscriber::set_default(registry);
|
||||
|
||||
Setup {
|
||||
_current_thread_subscriber_guard: guard,
|
||||
tenant_extractor,
|
||||
timeline_extractor,
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) {
|
||||
let missing: HashSet<MemoryIdentity> =
|
||||
HashSet::from_iter(missing.into_iter().map(MemoryIdentity));
|
||||
let expected: HashSet<MemoryIdentity> =
|
||||
HashSet::from_iter(expected.into_iter().map(MemoryIdentity));
|
||||
assert_eq!(missing, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn positive_one_level() {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn negative_one_level() {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing =
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn positive_multiple_levels() {
|
||||
let setup = setup_current_thread();
|
||||
|
||||
let span = tracing::info_span!("root");
|
||||
let _guard = span.enter();
|
||||
|
||||
let span = tracing::info_span!("child", tenant_id = "tenant-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn negative_multiple_levels() {
|
||||
let setup = setup_current_thread();
|
||||
|
||||
let span = tracing::info_span!("root");
|
||||
let _guard = span.enter();
|
||||
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn positive_subset_one_level() {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn positive_subset_multiple_levels() {
|
||||
let setup = setup_current_thread();
|
||||
|
||||
let span = tracing::info_span!("root");
|
||||
let _guard = span.enter();
|
||||
|
||||
let span = tracing::info_span!("child", tenant_id = "tenant-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn negative_subset_one_level() {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn negative_subset_multiple_levels() {
|
||||
let setup = setup_current_thread();
|
||||
|
||||
let span = tracing::info_span!("root");
|
||||
let _guard = span.enter();
|
||||
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_error_subscriber_not_set_up() {
|
||||
// no setup
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn panics_if_tracing_error_subscriber_has_wrong_filter() {
|
||||
let r = tracing_subscriber::registry().with({
|
||||
tracing_error::ErrorLayer::default().with_filter(
|
||||
tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
|
||||
if md.is_span() && *md.level() == tracing::Level::INFO {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}),
|
||||
)
|
||||
});
|
||||
|
||||
let _guard = tracing::subscriber::set_default(r);
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
}
|
||||
}
|
||||
@@ -52,6 +52,7 @@ sync_wrapper.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
|
||||
@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
min_lsn = min(min_lsn, lsn_range.start);
|
||||
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
updates.insert_historic(Arc::new(layer)).unwrap();
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
}
|
||||
|
||||
println!("min: {min_lsn}, max: {max_lsn}");
|
||||
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
is_incremental: false,
|
||||
short_id: format!("Layer {}", i),
|
||||
};
|
||||
updates.insert_historic(Arc::new(layer)).unwrap();
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
}
|
||||
updates.flush();
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
@@ -463,9 +463,13 @@ where
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;
|
||||
|
||||
let wal_seg =
|
||||
postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
let wal_seg = postgres_ffi::generate_wal_segment(
|
||||
segno,
|
||||
system_identifier,
|
||||
self.timeline.pg_version,
|
||||
self.lsn,
|
||||
)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..]).await?;
|
||||
Ok(())
|
||||
|
||||
@@ -25,6 +25,7 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
|
||||
@@ -86,8 +87,19 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize logging, which must be initialized before the custom panic hook is installed.
|
||||
logging::init(conf.log_format)?;
|
||||
// Initialize logging.
|
||||
//
|
||||
// It must be initialized before the custom panic hook is installed below.
|
||||
//
|
||||
// Regarding tracing_error enablement: at this time, we only use the
|
||||
// tracing_error crate to debug_assert that log spans contain tenant and timeline ids.
|
||||
// See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module
|
||||
let tracing_error_layer_enablement = if cfg!(debug_assertions) {
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter
|
||||
} else {
|
||||
TracingErrorLayerEnablement::Disabled
|
||||
};
|
||||
logging::init(conf.log_format, tracing_error_layer_enablement)?;
|
||||
|
||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
||||
// disarming this hook on pageserver, because we never tear down tracing.
|
||||
@@ -226,6 +238,7 @@ fn start_pageserver(
|
||||
);
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
set_launch_timestamp_metric(launch_ts);
|
||||
pageserver::preinitialize_metrics();
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
|
||||
@@ -1201,6 +1201,37 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum Level {
|
||||
Error,
|
||||
Warn,
|
||||
Info,
|
||||
Debug,
|
||||
Trace,
|
||||
}
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct Request {
|
||||
level: Level,
|
||||
message: String,
|
||||
}
|
||||
let body: Request = json_request(&mut r)
|
||||
.await
|
||||
.map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
|
||||
|
||||
match body.level {
|
||||
Level::Error => tracing::error!(?body.message),
|
||||
Level::Warn => tracing::warn!(?body.message),
|
||||
Level::Info => tracing::info!(?body.message),
|
||||
Level::Debug => tracing::debug!(?body.message),
|
||||
Level::Trace => tracing::trace!(?body.message),
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
@@ -1341,5 +1372,9 @@ pub fn make_router(
|
||||
testing_api!("set tenant state to broken", handle_tenant_break),
|
||||
)
|
||||
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
|
||||
.post(
|
||||
"/v1/tracing/event",
|
||||
testing_api!("emit a tracing event", post_tracing_event_handler),
|
||||
)
|
||||
.any(handler_404))
|
||||
}
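
For reference, hypothetical request bodies for the testing-only /v1/tracing/event route registered above; the field names follow the lowercase rename on the handler's Request struct, and the endpoint is only compiled in with the "testing" feature. The snippet just prints example payloads to POST as JSON.

fn main() {
    // Accepted levels mirror the handler's enum: error, warn, info, debug, trace.
    for level in ["error", "warn", "info", "debug", "trace"] {
        let body = format!(r#"{{"level": "{level}", "message": "emitted via the testing endpoint"}}"#);
        println!("{body}");
    }
}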
|
||||
|
||||
@@ -114,7 +114,7 @@ async fn import_rel(
path: &Path,
spcoid: Oid,
dboid: Oid,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
reader: &mut (impl AsyncRead + Unpin),
len: usize,
ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -200,7 +200,7 @@ async fn import_slru(
modification: &mut DatadirModification<'_>,
slru: SlruKind,
path: &Path,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
reader: &mut (impl AsyncRead + Unpin),
len: usize,
ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -612,8 +612,8 @@ async fn import_file(
Ok(None)
}

async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes> {
let mut buf: Vec<u8> = vec![];
reader.read_to_end(&mut buf).await?;
Ok(Bytes::copy_from_slice(&buf[..]))
Ok(Bytes::from(buf))
}
|
||||
@@ -44,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use metrics::{
|
||||
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
|
||||
UIntGauge, UIntGaugeVec,
|
||||
register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, Counter, CounterVec,
|
||||
Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
|
||||
UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::TenantState;
|
||||
@@ -205,6 +205,15 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_unexpected_ondemand_downloads_count",
|
||||
"Number of unexpected on-demand downloads. \
|
||||
We log more context for each increment, so we forgo any labels in this metric.",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
@@ -350,11 +359,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
|
||||
@@ -385,6 +389,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_timeline_client_bytes_started",
|
||||
"Incremented by the number of bytes associated with a remote timeline client operation. \
|
||||
The increment happens when the operation is scheduled.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_timeline_client_bytes_finished",
|
||||
"Incremented by the number of bytes associated with a remote timeline client operation. \
|
||||
The increment happens when the operation finishes (regardless of success/failure/shutdown).",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
@@ -739,6 +763,8 @@ pub struct RemoteTimelineClientMetrics {
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
@@ -749,6 +775,8 @@ impl RemoteTimelineClientMetrics {
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
calls_started_hist: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
@@ -787,6 +815,7 @@ impl RemoteTimelineClientMetrics {
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn calls_unfinished_gauge(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
@@ -828,32 +857,125 @@ impl RemoteTimelineClientMetrics {
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn bytes_started_counter(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntCounter {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.bytes_started_counter.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn bytes_finished_counter(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntCounter {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.bytes_finished_counter.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl RemoteTimelineClientMetrics {
|
||||
pub fn get_bytes_started_counter_value(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Option<u64> {
|
||||
let guard = self.bytes_started_counter.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
guard.get(&key).map(|counter| counter.get())
|
||||
}
|
||||
|
||||
pub fn get_bytes_finished_counter_value(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Option<u64> {
|
||||
let guard = self.bytes_finished_counter.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
guard.get(&key).map(|counter| counter.get())
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteTimelineClientMetrics::call_begin`].
|
||||
#[must_use]
|
||||
pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
|
||||
pub(crate) struct RemoteTimelineClientCallMetricGuard {
|
||||
/// Decremented on drop.
|
||||
calls_unfinished_metric: Option<IntGauge>,
|
||||
/// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
|
||||
bytes_finished: Option<(IntCounter, u64)>,
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientCallMetricGuard {
|
||||
/// Consume this guard object without decrementing the metric.
|
||||
/// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
|
||||
/// Consume this guard object without performing the metric updates it would do on `drop()`.
|
||||
/// The caller vouches to do the metric updates manually.
|
||||
pub fn will_decrement_manually(mut self) {
|
||||
self.0 = None; // prevent drop() from decrementing
|
||||
let RemoteTimelineClientCallMetricGuard {
|
||||
calls_unfinished_metric,
|
||||
bytes_finished,
|
||||
} = &mut self;
|
||||
calls_unfinished_metric.take();
|
||||
bytes_finished.take();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClientCallMetricGuard {
|
||||
fn drop(&mut self) {
|
||||
if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
|
||||
let RemoteTimelineClientCallMetricGuard {
|
||||
calls_unfinished_metric,
|
||||
bytes_finished,
|
||||
} = self;
|
||||
if let Some(guard) = calls_unfinished_metric.take() {
|
||||
guard.dec();
|
||||
}
|
||||
if let Some((bytes_finished_metric, value)) = bytes_finished {
|
||||
bytes_finished_metric.inc_by(*value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
|
||||
/// track the byte size of this call in applicable metric(s).
|
||||
pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
|
||||
/// Do not account for this call's byte size in any metrics.
|
||||
/// The `reason` field is there to make the call sites self-documenting
|
||||
/// about why they don't need the metric.
|
||||
DontTrackSize { reason: &'static str },
|
||||
/// Track the byte size of the call in applicable metric(s).
|
||||
Bytes(u64),
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
/// Increment the metrics that track ongoing calls to the remote timeline client instance.
|
||||
/// Update the metrics that change when a call to the remote timeline client instance starts.
|
||||
///
|
||||
/// Drop the returned guard object once the operation is finished to decrement the values.
|
||||
/// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
|
||||
/// is more suitable.
|
||||
/// Never do both.
|
||||
@@ -861,24 +983,51 @@ impl RemoteTimelineClientMetrics {
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
size: RemoteTimelineClientMetricsCallTrackSize,
|
||||
) -> RemoteTimelineClientCallMetricGuard {
|
||||
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
self.calls_started_hist(file_kind, op_kind)
|
||||
.observe(unfinished_metric.get() as f64);
|
||||
unfinished_metric.inc();
|
||||
RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
|
||||
.observe(calls_unfinished_metric.get() as f64);
|
||||
calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
|
||||
|
||||
let bytes_finished = match size {
|
||||
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
|
||||
// nothing to do
|
||||
None
|
||||
}
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
|
||||
self.bytes_started_counter(file_kind, op_kind).inc_by(size);
|
||||
let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
|
||||
Some((finished_counter, size))
|
||||
}
|
||||
};
|
||||
RemoteTimelineClientCallMetricGuard {
|
||||
calls_unfinished_metric: Some(calls_unfinished_metric),
|
||||
bytes_finished,
|
||||
}
|
||||
}
|
||||
|
||||
/// Manually decrement the metric instead of using the guard object.
|
||||
/// Manually update the metrics that track completions, instead of using the guard object.
|
||||
/// Using the guard object is generally preferable.
|
||||
/// See [`call_begin`] for more context.
|
||||
pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
|
||||
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
pub(crate) fn call_end(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
size: RemoteTimelineClientMetricsCallTrackSize,
|
||||
) {
|
||||
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
debug_assert!(
|
||||
unfinished_metric.get() > 0,
|
||||
calls_unfinished_metric.get() > 0,
|
||||
"begin and end should cancel out"
|
||||
);
|
||||
unfinished_metric.dec();
|
||||
calls_unfinished_metric.dec();
|
||||
match size {
|
||||
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
|
||||
self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
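Putting the guard, `call_begin`, and `call_end` changes together, the intended usage is roughly the following hedged sketch; the function and variable names are placeholders, not actual call sites from this change:

// Hypothetical call site, for illustration only.
fn record_layer_upload(metrics: &RemoteTimelineClientMetrics, layer_file_size: u64) {
    let guard = metrics.call_begin(
        &RemoteOpFileKind::Layer,
        &RemoteOpKind::Upload,
        RemoteTimelineClientMetricsCallTrackSize::Bytes(layer_file_size),
    );
    // ... perform the upload here ...
    drop(guard); // decrements the calls-unfinished gauge and adds layer_file_size to bytes_finished
}

The alternative is `guard.will_decrement_manually()` paired with a later `call_end` carrying the same `RemoteTimelineClientMetricsCallTrackSize`; as the doc comments above note, a call site must do one or the other, never both.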
@@ -891,6 +1040,8 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
remote_operation_time,
|
||||
calls_unfinished_gauge,
|
||||
calls_started_hist,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
@@ -911,6 +1062,22 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
{
|
||||
let _ = remote_physical_size_gauge; // used to avoid an 'unused' warning in the destructuring above
|
||||
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
@@ -974,3 +1141,10 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
poll_result
}
}

pub fn preinitialize_metrics() {
// We want to alert on this metric increasing.
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
}

@@ -20,7 +20,6 @@ use pageserver_api::models::{
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
};
|
||||
use postgres_backend::PostgresBackendTCP;
|
||||
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
@@ -32,6 +31,7 @@ use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -57,7 +57,10 @@ use crate::trace::Tracer;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<Bytes>> + '_ {
|
||||
fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
async_stream::try_stream! {
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
@@ -65,8 +68,8 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<
|
||||
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
let msg = "pageserver is shutting down".to_string();
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None));
|
||||
let msg = "pageserver is shutting down";
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||
Err(QueryError::Other(anyhow::anyhow!(msg)))
|
||||
}
|
||||
|
||||
@@ -125,7 +128,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<
|
||||
///
|
||||
/// XXX: Currently, any trailing data after the EOF marker prints a warning.
|
||||
/// Perhaps it should be a hard error?
|
||||
async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
|
||||
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
|
||||
use tokio::io::AsyncReadExt;
|
||||
let mut buf = [0u8; 512];
|
||||
|
||||
@@ -245,12 +248,23 @@ async fn page_service_conn_main(
|
||||
.set_nodelay(true)
|
||||
.context("could not set TCP_NODELAY")?;
|
||||
|
||||
let peer_addr = socket.peer_addr().context("get peer address")?;
|
||||
|
||||
// setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
|
||||
// - long enough for most valid compute connections
|
||||
// - less than infinite to stop us from "leaking" connections to long-gone computes
|
||||
//
|
||||
// no write timeout is used, because the kernel is assumed to error writes after some time.
|
||||
let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
|
||||
socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10)));
|
||||
let socket = std::pin::pin!(socket);
|
||||
|
||||
// XXX: pgbackend.run() should take the connection_ctx,
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
// and create the per-query context in process_query ourselves.
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
|
||||
match pgbackend
|
||||
.run(&mut conn_handler, task_mgr::shutdown_watcher)
|
||||
@@ -332,13 +346,16 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_pagerequests(
|
||||
async fn handle_pagerequests<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackendTCP,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
// NOTE: pagerequests handler exits when connection is closed,
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
@@ -436,16 +453,19 @@ impl PageServerHandler {
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_import_basebackup(
|
||||
async fn handle_import_basebackup<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackendTCP,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
base_lsn: Lsn,
|
||||
_end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError> {
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
@@ -486,15 +506,18 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_import_wal(
|
||||
async fn handle_import_wal<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackendTCP,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError> {
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
@@ -690,16 +713,21 @@ impl PageServerHandler {
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_basebackup_request(
|
||||
async fn handle_basebackup_request<IO>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackendTCP,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
@@ -712,6 +740,8 @@ impl PageServerHandler {
|
||||
.context("invalid basebackup lsn")?;
|
||||
}
|
||||
|
||||
let lsn_awaited_after = started.elapsed();
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||
pgb.flush().await?;
|
||||
@@ -732,7 +762,17 @@ impl PageServerHandler {
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
pgb.flush().await?;
|
||||
info!("basebackup complete");
|
||||
|
||||
let basebackup_after = started
|
||||
.elapsed()
|
||||
.checked_sub(lsn_awaited_after)
|
||||
.unwrap_or(Duration::ZERO);
|
||||
|
||||
info!(
|
||||
lsn_await_millis = lsn_awaited_after.as_millis(),
|
||||
basebackup_millis = basebackup_after.as_millis(),
|
||||
"basebackup complete"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -756,10 +796,13 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
|
||||
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
fn check_auth_jwt(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackendTCP,
|
||||
_pgb: &mut PostgresBackend<IO>,
|
||||
jwt_response: &[u8],
|
||||
) -> Result<(), QueryError> {
|
||||
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
|
||||
@@ -787,7 +830,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
|
||||
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackendTCP,
|
||||
_pgb: &mut PostgresBackend<IO>,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
Ok(())
|
||||
@@ -795,7 +838,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
|
||||
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackendTCP,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError> {
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
|
||||
@@ -118,6 +118,10 @@ pub struct Tenant {
|
||||
// Global pageserver config parameters
|
||||
pub conf: &'static PageServerConf,
|
||||
|
||||
/// The value creation timestamp, used to measure activation delay, see:
|
||||
/// <https://github.com/neondatabase/neon/issues/4025>
|
||||
loading_started_at: Instant,
|
||||
|
||||
state: watch::Sender<TenantState>,
|
||||
|
||||
// Overridden tenant-specific config parameters.
|
||||
@@ -267,10 +271,7 @@ impl UninitializedTimeline<'_> {
|
||||
.await
|
||||
.context("Failed to flush after basebackup import")?;
|
||||
|
||||
// Initialize without loading the layer map. We started with an empty layer map, and already
|
||||
// updated it for the layers that we created during the import.
|
||||
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
|
||||
self.initialize_with_lock(ctx, &mut timelines, false, true)
|
||||
self.initialize(ctx)
|
||||
}
|
||||
|
||||
fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
|
||||
@@ -1476,7 +1477,7 @@ impl Tenant {
|
||||
TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Active;
|
||||
|
||||
info!("Activating tenant {}", self.tenant_id);
|
||||
debug!(tenant_id = %self.tenant_id, "Activating tenant");
|
||||
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let not_broken_timelines = timelines_accessor
|
||||
@@ -1487,12 +1488,17 @@ impl Tenant {
|
||||
// down when they notice that the tenant is inactive.
|
||||
tasks::start_background_loops(self.tenant_id);
|
||||
|
||||
let mut activated_timelines = 0;
|
||||
let mut timelines_broken_during_activation = 0;
|
||||
|
||||
for timeline in not_broken_timelines {
|
||||
match timeline
|
||||
.activate(ctx)
|
||||
.context("timeline activation for activating tenant")
|
||||
{
|
||||
Ok(()) => {}
|
||||
Ok(()) => {
|
||||
activated_timelines += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to activate timeline {}: {:#}",
|
||||
@@ -1503,9 +1509,26 @@ impl Tenant {
|
||||
"failed to activate timeline {}: {}",
|
||||
timeline.timeline_id, e
|
||||
));
|
||||
|
||||
timelines_broken_during_activation += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed = self.loading_started_at.elapsed();
|
||||
let total_timelines = timelines_accessor.len();
|
||||
|
||||
// log a lot of stuff, because some tenants sometimes suffer from user-visible
|
||||
// times to activate. see https://github.com/neondatabase/neon/issues/4025
|
||||
info!(
|
||||
since_creation_millis = elapsed.as_millis(),
|
||||
tenant_id = %self.tenant_id,
|
||||
activated_timelines,
|
||||
timelines_broken_during_activation,
|
||||
total_timelines,
|
||||
post_state = <&'static str>::from(&*current_state),
|
||||
"activation attempt finished"
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -1812,6 +1835,9 @@ impl Tenant {
|
||||
Tenant {
|
||||
tenant_id,
|
||||
conf,
|
||||
// using now here is a good enough approximation to catch tenants with really long
|
||||
// activation times.
|
||||
loading_started_at: Instant::now(),
|
||||
tenant_conf: Arc::new(RwLock::new(tenant_conf)),
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
@@ -2326,8 +2352,6 @@ impl Tenant {
|
||||
)
|
||||
})?;
|
||||
|
||||
// Initialize the timeline without loading the layer map, because we already updated the layer
|
||||
// map above, when we imported the datadir.
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)?
|
||||
@@ -2857,7 +2881,13 @@ pub mod harness {
|
||||
};
|
||||
|
||||
LOG_HANDLE.get_or_init(|| {
|
||||
logging::init(logging::LogFormat::Test).expect("Failed to init test logging")
|
||||
logging::init(
|
||||
logging::LogFormat::Test,
|
||||
// enable it in case the tests exercise code paths that use
|
||||
// debug_assert_current_span_has_tenant_and_timeline_id
|
||||
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
)
|
||||
.expect("Failed to init test logging")
|
||||
});
|
||||
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
|
||||
@@ -48,11 +48,10 @@ mod layer_coverage;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::KeyPartitioning;
|
||||
use crate::metrics::NUM_ONDISK_LAYERS;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::Result;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
@@ -126,7 +125,7 @@ where
|
||||
///
|
||||
/// Insert an on-disk layer.
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<L>) -> anyhow::Result<()> {
|
||||
pub fn insert_historic(&mut self, layer: Arc<L>) {
|
||||
self.layer_map.insert_historic_noflush(layer)
|
||||
}
|
||||
|
||||
@@ -274,22 +273,16 @@ where
|
||||
///
|
||||
/// Helper function for BatchedUpdates::insert_historic
|
||||
///
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) -> anyhow::Result<()> {
|
||||
let key = historic_layer_coverage::LayerKey::from(&*layer);
|
||||
if self.historic.contains(&key) {
|
||||
bail!(
|
||||
"Attempt to insert duplicate layer {} in layer map",
|
||||
layer.short_id()
|
||||
);
|
||||
}
|
||||
self.historic.insert(key, Arc::clone(&layer));
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
self.historic.insert(
|
||||
historic_layer_coverage::LayerKey::from(&*layer),
|
||||
Arc::clone(&layer),
|
||||
);
|
||||
|
||||
if Self::is_l0(&layer) {
|
||||
self.l0_delta_layers.push(layer);
|
||||
}
|
||||
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -314,8 +307,6 @@ where
|
||||
"failed to locate removed historic layer from l0_delta_layers"
|
||||
);
|
||||
}
|
||||
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
pub(self) fn replace_historic_noflush(
|
||||
@@ -843,7 +834,7 @@ mod tests {
|
||||
|
||||
let expected_in_counts = (1, usize::from(expected_l0));
|
||||
|
||||
map.batch_update().insert_historic(remote.clone()).unwrap();
|
||||
map.batch_update().insert_historic(remote.clone());
|
||||
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
|
||||
|
||||
let replaced = map
|
||||
|
||||
@@ -417,14 +417,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn contains(&self, layer_key: &LayerKey) -> bool {
|
||||
match self.buffer.get(layer_key) {
|
||||
Some(None) => false, // layer remove was buffered
|
||||
Some(_) => true, // layer insert was buffered
|
||||
None => self.layers.contains_key(layer_key), // no buffered ops for this layer
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
|
||||
self.buffer.insert(layer_key, Some(value));
|
||||
}
|
||||
|
||||
@@ -219,7 +219,8 @@ use utils::lsn::Lsn;
|
||||
|
||||
use crate::metrics::{
|
||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||
REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::{
|
||||
@@ -367,9 +368,13 @@ impl RemoteTimelineClient {
|
||||
|
||||
/// Download index file
|
||||
pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
|
||||
let _unfinished_gauge_guard = self
|
||||
.metrics
|
||||
.call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
&RemoteOpFileKind::Index,
|
||||
&RemoteOpKind::Download,
|
||||
crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
|
||||
reason: "no need for a downloads gauge",
|
||||
},
|
||||
);
|
||||
|
||||
download::download_index_part(
|
||||
self.conf,
|
||||
@@ -398,9 +403,13 @@ impl RemoteTimelineClient {
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = {
|
||||
let _unfinished_gauge_guard = self
|
||||
.metrics
|
||||
.call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
&RemoteOpFileKind::Layer,
|
||||
&RemoteOpKind::Download,
|
||||
crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
|
||||
reason: "no need for a downloads gauge",
|
||||
},
|
||||
);
|
||||
download::download_layer_file(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
@@ -886,11 +895,32 @@ impl RemoteTimelineClient {
|
||||
fn calls_unfinished_metric_impl(
|
||||
&self,
|
||||
op: &UploadOp,
|
||||
) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
|
||||
) -> Option<(
|
||||
RemoteOpFileKind,
|
||||
RemoteOpKind,
|
||||
RemoteTimelineClientMetricsCallTrackSize,
|
||||
)> {
|
||||
use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
|
||||
let res = match op {
|
||||
UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
|
||||
UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
|
||||
UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
|
||||
UploadOp::UploadLayer(_, m) => (
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
|
||||
),
|
||||
UploadOp::UploadMetadata(_, _) => (
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Upload,
|
||||
DontTrackSize {
|
||||
reason: "metadata uploads are tiny",
|
||||
},
|
||||
),
|
||||
UploadOp::Delete(file_kind, _) => (
|
||||
*file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
DontTrackSize {
|
||||
reason: "should we track deletes? positive or negative sign?",
|
||||
},
|
||||
),
|
||||
UploadOp::Barrier(_) => {
|
||||
// we do not account these
|
||||
return None;
|
||||
@@ -900,20 +930,20 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
|
||||
Some(x) => x,
|
||||
None => return,
|
||||
};
|
||||
let guard = self.metrics.call_begin(&file_kind, &op_kind);
|
||||
let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
|
||||
guard.will_decrement_manually(); // in unfinished_ops_metric_end()
|
||||
}
|
||||
|
||||
fn calls_unfinished_metric_end(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
|
||||
Some(x) => x,
|
||||
None => return,
|
||||
};
|
||||
self.metrics.call_end(&file_kind, &op_kind);
|
||||
self.metrics.call_end(&file_kind, &op_kind, track_bytes);
|
||||
}
|
||||
|
||||
fn stop(&self) {
|
||||
@@ -981,11 +1011,19 @@ impl RemoteTimelineClient {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{
|
||||
tenant::harness::{TenantHarness, TIMELINE_ID},
|
||||
context::RequestContext,
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
Tenant,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use std::{collections::HashSet, path::Path};
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use tokio::runtime::EnterGuard;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
@@ -1034,39 +1072,80 @@ mod tests {
|
||||
assert_eq!(found, expected);
|
||||
}
|
||||
|
||||
struct TestSetup {
|
||||
runtime: &'static tokio::runtime::Runtime,
|
||||
entered_runtime: EnterGuard<'static>,
|
||||
harness: TenantHarness<'static>,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_ctx: RequestContext,
|
||||
remote_fs_dir: PathBuf,
|
||||
client: Arc<RemoteTimelineClient>,
|
||||
}
|
||||
|
||||
impl TestSetup {
|
||||
fn new(test_name: &str) -> anyhow::Result<Self> {
|
||||
// Use a current-thread runtime in the test
|
||||
let runtime = Box::leak(Box::new(
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?,
|
||||
));
|
||||
let entered_runtime = runtime.enter();
|
||||
|
||||
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let (tenant, ctx) = runtime.block_on(harness.load());
|
||||
// create an empty timeline directory
|
||||
let timeline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
||||
let _ = timeline.initialize(&ctx).unwrap();
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
|
||||
|
||||
let storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
runtime,
|
||||
tenant_id: harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
storage_impl: storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
runtime,
|
||||
entered_runtime,
|
||||
harness,
|
||||
tenant,
|
||||
tenant_ctx: ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Test scheduling
|
||||
#[test]
|
||||
fn upload_scheduling() -> anyhow::Result<()> {
|
||||
// Use a current-thread runtime in the test
|
||||
let runtime = Box::leak(Box::new(
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?,
|
||||
));
|
||||
let _entered = runtime.enter();
|
||||
|
||||
let harness = TenantHarness::create("upload_scheduling")?;
|
||||
let (tenant, ctx) = runtime.block_on(harness.load());
|
||||
let _timeline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
|
||||
|
||||
let storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
|
||||
// Test outline:
|
||||
//
|
||||
// Schedule upload of a bunch of layers. Check that they are started immediately, not queued
|
||||
@@ -1081,21 +1160,19 @@ mod tests {
|
||||
// Schedule another deletion. Check that it's launched immediately.
|
||||
// Schedule index upload. Check that it's queued
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
let TestSetup {
|
||||
runtime,
|
||||
tenant_id: harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
storage_impl,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
});
|
||||
entered_runtime: _entered_runtime,
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
tenant_ctx: _tenant_ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
} = TestSetup::new("upload_scheduling").unwrap();
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let remote_timeline_dir =
|
||||
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
|
||||
@@ -1216,4 +1293,90 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
|
||||
// Setup
|
||||
|
||||
let TestSetup {
|
||||
runtime,
|
||||
harness,
|
||||
client,
|
||||
..
|
||||
} = TestSetup::new("metrics")?;
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client.init_upload_queue_for_empty_remote(&metadata)?;
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let content_1 = dummy_contents("foo");
|
||||
std::fs::write(
|
||||
timeline_path.join(layer_file_name_1.file_name()),
|
||||
&content_1,
|
||||
)?;
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
struct BytesStartedFinished {
|
||||
started: Option<usize>,
|
||||
finished: Option<usize>,
|
||||
}
|
||||
let get_bytes_started_stopped = || {
|
||||
let started = client
|
||||
.metrics
|
||||
.get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
|
||||
.map(|v| v.try_into().unwrap());
|
||||
let stopped = client
|
||||
.metrics
|
||||
.get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
|
||||
.map(|v| v.try_into().unwrap());
|
||||
BytesStartedFinished {
|
||||
started,
|
||||
finished: stopped,
|
||||
}
|
||||
};
|
||||
|
||||
// Test
|
||||
|
||||
let init = get_bytes_started_stopped();
|
||||
|
||||
client.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64),
|
||||
)?;
|
||||
|
||||
let pre = get_bytes_started_stopped();
|
||||
|
||||
runtime.block_on(client.wait_completion())?;
|
||||
|
||||
let post = get_bytes_started_stopped();
|
||||
|
||||
// Validate
|
||||
|
||||
assert_eq!(
|
||||
init,
|
||||
BytesStartedFinished {
|
||||
started: None,
|
||||
finished: None
|
||||
}
|
||||
);
|
||||
assert_eq!(
|
||||
pre,
|
||||
BytesStartedFinished {
|
||||
started: Some(content_1.len()),
|
||||
// assert that the _finished metric is created eagerly so that subtractions work on first sample
|
||||
finished: Some(0),
|
||||
}
|
||||
);
|
||||
assert_eq!(
|
||||
post,
|
||||
BytesStartedFinished {
|
||||
started: Some(content_1.len()),
|
||||
finished: Some(content_1.len())
|
||||
}
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ use tracing::{info, warn};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
@@ -43,6 +44,8 @@ pub async fn download_layer_file<'a>(
|
||||
layer_file_name: &'a LayerFileName,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||
@@ -154,7 +157,7 @@ pub async fn download_layer_file<'a>(
|
||||
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
tracing::info!("download complete: {}", local_path.display());
|
||||
tracing::debug!("download complete: {}", local_path.display());
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
|
||||
@@ -48,7 +48,7 @@ use crate::tenant::{
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::metrics::TimelineMetrics;
|
||||
use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
|
||||
@@ -936,6 +936,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
|
||||
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||
let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) };
|
||||
let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) };
|
||||
@@ -1483,7 +1484,7 @@ impl Timeline {
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
updates.insert_historic(Arc::new(layer))?;
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
@@ -1515,7 +1516,7 @@ impl Timeline {
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
updates.insert_historic(Arc::new(layer))?;
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
@@ -1589,7 +1590,7 @@ impl Timeline {
|
||||
// remote index file?
|
||||
// If so, rename_to_backup those files & replace their local layer with
|
||||
// a RemoteLayer in the layer map so that we re-download them on-demand.
|
||||
if let Some(local_layer) = &local_layer {
|
||||
if let Some(local_layer) = local_layer {
|
||||
let local_layer_path = local_layer
|
||||
.local_path()
|
||||
.expect("caller must ensure that local_layers only contains local layers");
|
||||
@@ -1614,6 +1615,7 @@ impl Timeline {
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
updates.remove_historic(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
@@ -1649,11 +1651,7 @@ impl Timeline {
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
|
||||
if let Some(local_layer) = &local_layer {
|
||||
updates.replace_historic(local_layer, remote_layer)?;
|
||||
} else {
|
||||
updates.insert_historic(remote_layer)?;
|
||||
}
|
||||
updates.insert_historic(remote_layer);
|
||||
}
|
||||
LayerFileName::Delta(deltafilename) => {
|
||||
// Create a RemoteLayer for the delta file.
|
||||
@@ -1677,11 +1675,7 @@ impl Timeline {
|
||||
LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted),
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
if let Some(local_layer) = &local_layer {
|
||||
updates.replace_historic(local_layer, remote_layer)?;
|
||||
} else {
|
||||
updates.insert_historic(remote_layer)?;
|
||||
}
|
||||
updates.insert_historic(remote_layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2355,6 +2349,7 @@ impl Timeline {
|
||||
id,
|
||||
ctx.task_kind()
|
||||
);
|
||||
UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
|
||||
timeline.download_remote_layer(remote_layer).await?;
|
||||
continue 'layer_map_search;
|
||||
}
|
||||
@@ -2728,7 +2723,7 @@ impl Timeline {
|
||||
.write()
|
||||
.unwrap()
|
||||
.batch_update()
|
||||
.insert_historic(Arc::new(new_delta))?;
|
||||
.insert_historic(Arc::new(new_delta));
|
||||
|
||||
// update the timeline's physical size
|
||||
let sz = new_delta_path.metadata()?.len();
|
||||
@@ -2933,7 +2928,7 @@ impl Timeline {
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.add(metadata.len());
|
||||
updates.insert_historic(Arc::new(l))?;
|
||||
updates.insert_historic(Arc::new(l));
|
||||
}
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
@@ -3366,7 +3361,7 @@ impl Timeline {
|
||||
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
|
||||
updates.insert_historic(x)?;
|
||||
updates.insert_historic(x);
|
||||
}
|
||||
|
||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||
@@ -3818,11 +3813,13 @@ impl Timeline {
|
||||
/// If the caller has a deadline or needs a timeout, they can simply stop polling:
|
||||
/// we're **cancellation-safe** because the download happens in a separate task_mgr task.
|
||||
/// So, the current download attempt will run to completion even if we stop polling.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))]
|
||||
#[instrument(skip_all, fields(layer=%remote_layer.short_id()))]
|
||||
pub async fn download_remote_layer(
|
||||
&self,
|
||||
remote_layer: Arc<RemoteLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
use std::sync::atomic::Ordering::Relaxed;
|
||||
|
||||
let permit = match Arc::clone(&remote_layer.ongoing_download)
|
||||
@@ -3866,6 +3863,8 @@ impl Timeline {
|
||||
.await;
|
||||
|
||||
if let Ok(size) = &result {
|
||||
info!("layer file download finished");
|
||||
|
||||
// XXX the temp file is still around in Err() case
|
||||
// and consumes space until we clean up upon pageserver restart.
|
||||
self_clone.metrics.resident_physical_size_gauge.add(*size);
|
||||
@@ -3937,6 +3936,8 @@ impl Timeline {
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
|
||||
info!("on-demand download successful");
|
||||
|
||||
// Now that we've inserted the download into the layer map,
|
||||
// close the semaphore. This will make other waiters for
|
||||
// this download return Ok(()).
|
||||
@@ -3944,7 +3945,7 @@ impl Timeline {
|
||||
remote_layer.ongoing_download.close();
|
||||
} else {
|
||||
// Keep semaphore open. We'll drop the permit at the end of the function.
|
||||
error!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
|
||||
error!("layer file download failed: {:?}", result.as_ref().unwrap_err());
|
||||
}
|
||||
|
||||
// Don't treat it as an error if the task that triggered the download
|
||||
@@ -4255,3 +4256,36 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {

bail!("couldn't find an unused backup number for {:?}", path)
}

#[cfg(not(debug_assertions))]
#[inline]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}

#[cfg(debug_assertions)]
#[inline]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
use utils::tracing_span_assert;

pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
tracing_span_assert::MultiNameExtractor<2>,
> = once_cell::sync::Lazy::new(|| {
tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
});

pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
tracing_span_assert::MultiNameExtractor<2>,
> = once_cell::sync::Lazy::new(|| {
tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
});

match tracing_span_assert::check_fields_present([
&*TENANT_ID_EXTRACTOR,
&*TIMELINE_ID_EXTRACTOR,
]) {
Ok(()) => (),
Err(missing) => panic!(
"missing extractors: {:?}",
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
),
}
}
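To make the assertion's contract concrete, here is a hedged sketch of a caller entering a span that satisfies both extractors; the span name, function name, and field values are placeholders, not code from this change:

// Hypothetical caller, for illustration only.
use tracing::Instrument;

async fn download_under_ids_span(tenant_id: String, timeline_id: String) {
    async {
        // Anything in here may safely call
        // debug_assert_current_span_has_tenant_and_timeline_id().
    }
    .instrument(tracing::info_span!(
        "layer_download",
        tenant_id = %tenant_id,
        timeline_id = %timeline_id
    ))
    .await;
}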
@@ -348,7 +348,7 @@ impl ConnectionManagerState {
|
||||
.context("walreceiver connection handling failure")
|
||||
}
|
||||
.instrument(
|
||||
info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id),
|
||||
info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id),
|
||||
)
|
||||
});
|
||||
|
||||
|
||||
@@ -37,8 +37,8 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use pq_proto::PageserverFeedback;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
|
||||
/// Status of the connection.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -319,12 +319,12 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let last_received_lsn = u64::from(last_lsn);
|
||||
let last_received_lsn = last_lsn;
|
||||
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
|
||||
let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn());
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
|
||||
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
|
||||
let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn);
|
||||
let remote_consistent_lsn = timeline_remote_consistent_lsn;
|
||||
let ts = SystemTime::now();
|
||||
|
||||
// Update the status about what we just received. This is shown in the mgmt API.
|
||||
|
||||
@@ -370,6 +370,74 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
return found;
|
||||
}
|
||||
|
||||
/*
|
||||
* Evict a page (if present) from the local file cache
|
||||
*/
|
||||
void
|
||||
lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
{
|
||||
BufferTag tag;
|
||||
FileCacheEntry* entry;
|
||||
ssize_t rc;
|
||||
bool found;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found);
|
||||
|
||||
if (!found)
|
||||
{
|
||||
/* nothing to do */
|
||||
LWLockRelease(lfc_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* remove the page from the cache */
|
||||
entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1)));
|
||||
|
||||
/*
|
||||
* If the chunk has no live entries, we can position the chunk to be
|
||||
* recycled first.
|
||||
*/
|
||||
if (entry->bitmap[chunk_offs >> 5] == 0)
|
||||
{
|
||||
bool has_remaining_pages = false; /* must start false: the loop below only ever sets it to true */
|
||||
|
||||
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
|
||||
if (entry->bitmap[i] != 0)
|
||||
{
|
||||
has_remaining_pages = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Put the entry at the position that is first to be reclaimed when
|
||||
* we have no cached pages remaining in the chunk
|
||||
*/
|
||||
if (!has_remaining_pages)
|
||||
{
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Done: apart from the empty-chunk case handled above, we don't move chunks
* in the LRU on eviction, because eviction isn't usage.
|
||||
*/
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to read page from local cache.
|
||||
* Returns true if page is found in local cache.
|
||||
@@ -528,7 +596,6 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Record structure holding the to be exposed cache data.
|
||||
*/
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
#include "pagestore_client.h"
|
||||
#include "fmgr.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "storage/buf_internals.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include "libpq/pqformat.h"
|
||||
@@ -57,6 +59,8 @@ int n_unflushed_requests = 0;
|
||||
int flush_every_n_requests = 8;
|
||||
int readahead_buffer_size = 128;
|
||||
|
||||
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||
|
||||
static void pageserver_flush(void);
|
||||
|
||||
static bool
|
||||
@@ -467,6 +471,8 @@ pg_init_libpagestore(void)
|
||||
smgr_hook = smgr_neon;
|
||||
smgr_init_hook = smgr_init_neon;
|
||||
dbsize_hook = neon_dbsize;
|
||||
old_redo_read_buffer_filter = redo_read_buffer_filter;
|
||||
redo_read_buffer_filter = neon_redo_read_buffer_filter;
|
||||
}
|
||||
lfc_init();
|
||||
}
|
||||
|
||||
@@ -32,40 +32,3 @@ CREATE VIEW local_cache AS
|
||||
SELECT P.* FROM local_cache_pages() AS P
|
||||
(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
|
||||
relforknumber int2, relblocknumber int8, accesscount int4);
|
||||
|
||||
create table postgres_log (
|
||||
log_time timestamp(3) with time zone,
|
||||
user_name text,
|
||||
database_name text,
|
||||
process_id integer,
|
||||
connection_from text,
|
||||
session_id text,
|
||||
session_line_num bigint,
|
||||
command_tag text,
|
||||
session_start_time timestamp with time zone,
|
||||
virtual_transaction_id text,
|
||||
transaction_id bigint,
|
||||
error_severity text,
|
||||
sql_state_code text,
|
||||
message text,
|
||||
detail text,
|
||||
hint text,
|
||||
internal_query text,
|
||||
internal_query_pos integer,
|
||||
context text,
|
||||
query text,
|
||||
query_pos integer,
|
||||
location text,
|
||||
application_name text,
|
||||
backend_type text,
|
||||
leader_pid integer,
|
||||
query_id bigint
|
||||
);
|
||||
|
||||
CREATE FUNCTION read_postgres_log()
|
||||
RETURNS setof postgres_log
|
||||
AS 'MODULE_PATHNAME', 'read_postgres_log'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
CREATE VIEW pg_log AS
|
||||
SELECT * from read_postgres_log();
|
||||
|
||||
pgxn/neon/neon.c
@@ -11,14 +11,8 @@
|
||||
#include "postgres.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "access/table.h"
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "commands/copy.h"
|
||||
#include "nodes/makefuncs.h"
|
||||
#include "nodes/value.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "catalog/pg_type.h"
|
||||
@@ -26,11 +20,11 @@
|
||||
#include "funcapi.h"
|
||||
#include "access/htup_details.h"
|
||||
#include "utils/pg_lsn.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
#include "pagestore_client.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
void _PG_init(void);
|
||||
@@ -47,7 +41,6 @@ _PG_init(void)
|
||||
PG_FUNCTION_INFO_V1(pg_cluster_size);
|
||||
PG_FUNCTION_INFO_V1(backpressure_lsns);
|
||||
PG_FUNCTION_INFO_V1(backpressure_throttling_time);
|
||||
PG_FUNCTION_INFO_V1(read_postgres_log);
|
||||
|
||||
Datum
|
||||
pg_cluster_size(PG_FUNCTION_ARGS)
|
||||
@@ -93,129 +86,3 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_UINT64(BackpressureThrottlingTime());
|
||||
}
|
||||
|
||||
|
||||
#define PG_LOG_DIR "log"
|
||||
#define POSTGRES_LOG "postgres_log"
|
||||
#define LOG_TABLE_N_COLUMS 26
|
||||
|
||||
typedef struct {
|
||||
char* path;
|
||||
time_t ctime;
|
||||
} LogFile;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
Relation log_table;
|
||||
List* log_files;
|
||||
CopyFromState copy_state;
|
||||
ListCell* curr_log;
|
||||
} LogfileContext;
|
||||
|
||||
static int cmp_log_ctime(const ListCell *a, const ListCell *b)
|
||||
{
|
||||
LogFile* la = (LogFile*)lfirst(a);
|
||||
LogFile* lb = (LogFile*)lfirst(b);
|
||||
return la->ctime < lb->ctime ? -1 : la->ctime == lb->ctime ? 0 : 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
Datum
|
||||
read_postgres_log(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
MemoryContext oldcontext;
|
||||
LogfileContext *fctx; /* User function context. */
|
||||
List* log_files = NULL;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
struct dirent *dent;
|
||||
DIR* dir;
|
||||
struct stat statbuf;
|
||||
char* path;
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
|
||||
/* Switch context when allocating stuff to be used in later calls */
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
|
||||
/* Create a user function context for cross-call persistence */
|
||||
fctx = (LogfileContext *) palloc(sizeof(LogfileContext));
|
||||
fctx->log_files = NULL;
|
||||
fctx->copy_state = NULL;
|
||||
|
||||
if ((dir = AllocateDir(PG_LOG_DIR)) != NULL)
|
||||
{
|
||||
while ((dent = ReadDirExtended(dir, PG_LOG_DIR, LOG)) != NULL)
|
||||
{
|
||||
/* Ignore non-csv files */
|
||||
if (strcmp(dent->d_name + strlen(dent->d_name) - 4, ".csv") != 0)
|
||||
continue;
|
||||
|
||||
path = psprintf("%s/%s", PG_LOG_DIR, dent->d_name);
|
||||
if (stat(path, &statbuf) == 0)
|
||||
{
|
||||
LogFile* log = (LogFile*)palloc(sizeof(LogFile));
|
||||
log->ctime = statbuf.st_ctime;
|
||||
log->path = path;
|
||||
fctx->log_files = lappend(fctx->log_files, log);
|
||||
}
|
||||
else if (errno != ENOENT) /* file can be concurrently removed */
|
||||
{
|
||||
elog(LOG, "Failed to access log file %s", path);
|
||||
pfree(path);
|
||||
}
|
||||
}
|
||||
FreeDir(dir);
|
||||
}
|
||||
list_sort(fctx->log_files, cmp_log_ctime);
|
||||
fctx->log_table = table_openrv(makeRangeVar(NULL, POSTGRES_LOG, -1), AccessShareLock);
|
||||
fctx->curr_log = list_head(fctx->log_files);
|
||||
|
||||
/* Remember the user function context. */
|
||||
funcctx->user_fctx = fctx;
|
||||
|
||||
/* Return to original context when allocating transient memory */
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
/* Get the saved state */
|
||||
fctx = funcctx->user_fctx;
|
||||
while (fctx->curr_log != NULL)
|
||||
{
|
||||
if (fctx->copy_state == NULL)
|
||||
{
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
fctx->copy_state = BeginCopyFrom(NULL,
|
||||
fctx->log_table,
|
||||
NULL,
|
||||
((LogFile*)lfirst(fctx->curr_log))->path,
|
||||
false,
|
||||
NULL,
|
||||
NIL,
|
||||
list_make1(makeDefElem("format", (Node *) makeString("csv"), -1)));
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
if (fctx->copy_state != NULL)
|
||||
{
|
||||
Datum values[LOG_TABLE_N_COLUMS];
|
||||
bool nulls[LOG_TABLE_N_COLUMS];
|
||||
if (NextCopyFrom(fctx->copy_state, NULL,
|
||||
values, nulls))
|
||||
{
|
||||
HeapTuple tuple = heap_form_tuple(RelationGetDescr(fctx->log_table), values, nulls);
|
||||
Datum result = HeapTupleGetDatum(tuple);
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
}
|
||||
EndCopyFrom(fctx->copy_state);
|
||||
fctx->copy_state = NULL;
|
||||
}
|
||||
fctx->curr_log = lnext(fctx->log_files, fctx->curr_log);
|
||||
}
|
||||
table_close(fctx->log_table, AccessShareLock);
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#ifndef NEON_H
|
||||
#define NEON_H
|
||||
#include "access/xlogreader.h"
|
||||
|
||||
/* GUCs */
|
||||
extern char *neon_auth_token;
|
||||
@@ -20,4 +21,11 @@ extern char *neon_tenant;
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
/*
|
||||
* Returns true if we shouldn't do REDO on that block in record indicated by
|
||||
* block_id; false otherwise.
|
||||
*/
|
||||
extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
|
||||
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
|
||||
|
||||
#endif /* NEON_H */
|
||||
|
||||
@@ -207,6 +207,7 @@ extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
|
||||
extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
|
||||
extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
|
||||
extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
|
||||
extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
|
||||
extern void lfc_init(void);
|
||||
|
||||
|
||||
|
||||
@@ -189,6 +189,7 @@ typedef struct PrfHashEntry {
|
||||
#define SH_DEFINE
|
||||
#define SH_DECLARE
|
||||
#include "lib/simplehash.h"
|
||||
#include "neon.h"
|
||||
|
||||
/*
|
||||
* PrefetchState maintains the state of (prefetch) getPage@LSN requests.
|
||||
@@ -1209,6 +1210,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
|
||||
if (ShutdownRequestPending)
|
||||
return;
|
||||
/* Don't log any pages if we're not allowed to do so. */
|
||||
if (!XLogInsertAllowed())
|
||||
return;
|
||||
|
||||
/*
|
||||
* Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
|
||||
@@ -1375,8 +1379,18 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN
|
||||
|
||||
if (RecoveryInProgress())
|
||||
{
|
||||
/*
|
||||
* We don't know if WAL has been generated but not yet replayed, so
|
||||
* we're conservative in our estimates about latest pages.
|
||||
*/
|
||||
*latest = false;
|
||||
lsn = GetXLogReplayRecPtr(NULL);
|
||||
|
||||
/*
|
||||
* Get the last written LSN of this page.
|
||||
*/
|
||||
lsn = GetLastWrittenLSN(rnode, forknum, blkno);
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
}
|
||||
@@ -1559,6 +1573,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
/*
|
||||
* Newly created relation is empty, remember that in the relsize cache.
|
||||
*
|
||||
* Note that in REDO, this is called to make sure the relation fork exists,
|
||||
* but it does not truncate the relation. So, we can only update the
|
||||
* relsize if it didn't exist before.
|
||||
*
|
||||
* Also, in redo, we must make sure to update the cached size of the
|
||||
* relation, as that is the primary source of truth for REDO's
|
||||
* file length considerations, and as file extension isn't (perfectly)
|
||||
* logged, we need to take care of that before we hit file size checks.
|
||||
*
|
||||
* FIXME: This is currently not just an optimization, but required for
|
||||
* correctness. Postgres can call smgrnblocks() on the newly-created
|
||||
* relation. Currently, we don't call SetLastWrittenLSN() when a new
|
||||
@@ -1566,7 +1589,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
* cache, we might call smgrnblocks() on the newly-created relation before
|
||||
* the creation WAL record has been received by the page server.
|
||||
*/
|
||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
|
||||
if (isRedo)
|
||||
{
|
||||
update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
|
||||
get_cached_relsize(reln->smgr_rnode.node, forkNum,
|
||||
&reln->smgr_cached_nblocks[forkNum]);
|
||||
}
|
||||
else
|
||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1831,6 +1861,26 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
.blockNum = blkno,
|
||||
};
|
||||
|
||||
/*
|
||||
* The redo process does not lock pages that it needs to replay but are
|
||||
* not in the shared buffers, so a concurrent process may request the
|
||||
* page after redo has decided it won't redo that page and updated the
|
||||
* LwLSN for that page.
|
||||
* If we're in hot standby we need to take care that we don't return
|
||||
* until after REDO has finished replaying up to that LwLSN, as the page
|
||||
* should have been locked up to that point.
|
||||
*
|
||||
* See also the description on neon_redo_read_buffer_filter below.
|
||||
*
|
||||
* NOTE: It is possible that the WAL redo process will still do IO due to
|
||||
* concurrent failed read IOs. Those IOs should never have a request_lsn
|
||||
* that is as large as the WAL record we're currently replaying, if it
|
||||
* weren't for the behaviour of the LwLsn cache that uses the highest
|
||||
* value of the LwLsn cache when the entry is not found.
|
||||
*/
|
||||
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
|
||||
XLogWaitForReplayOf(request_lsn);
|
||||
|
||||
/*
|
||||
* Try to find prefetched page in the list of received pages.
|
||||
*/
|
||||
@@ -2584,3 +2634,143 @@ smgr_init_neon(void)
|
||||
smgr_init_standard();
|
||||
neon_init();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Return whether we can skip the redo for this block.
|
||||
*
|
||||
* The conditions for skipping the IO are:
|
||||
*
|
||||
* - The block is not in the shared buffers, and
|
||||
* - The block is not in the local file cache
|
||||
*
|
||||
* ... because any subsequent read of the page requires us to read
|
||||
* the new version of the page from the PageServer. We do not
|
||||
* check the local file cache; we instead evict the page from LFC: it
|
||||
* is cheaper than going through the FS calls to read the page, and
|
||||
* limits the number of lock operations used in the REDO process.
|
||||
*
|
||||
* We have one exception to the rules for skipping IO: We always apply
|
||||
* changes to shared catalogs' pages. Although this is mostly out of caution,
|
||||
* catalog updates usually result in backends rebuilding their catalog snapshot,
|
||||
* which means it's quite likely the modified page is going to be used soon.
|
||||
*
|
||||
* It is important to note that skipping WAL redo for a page also means
|
||||
* the page isn't locked by the redo process, as there is no Buffer
|
||||
* being returned, nor is there a buffer descriptor to lock.
|
||||
* This means that any IO that wants to read this block needs to wait
|
||||
* for the WAL REDO process to finish processing the WAL record before
|
||||
* it allows the system to start reading the block, as releasing the
|
||||
* block early could lead to phantom reads.
|
||||
*
|
||||
* For example, REDO for a WAL record that modifies 3 blocks could skip
|
||||
* the first block, wait for a lock on the second, and then modify the
|
||||
* third block. Without skipping, all blocks would be locked and phantom
|
||||
* reads would not occur, but with skipping, a concurrent process could
|
||||
* read block 1 with post-REDO contents and read block 3 with pre-REDO
|
||||
* contents, where with REDO locking it would wait on block 1 and see
|
||||
* block 3 with post-REDO contents only.
|
||||
*/
|
||||
bool
|
||||
neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
{
|
||||
XLogRecPtr end_recptr = record->EndRecPtr;
|
||||
XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1;
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blkno;
|
||||
BufferTag tag;
|
||||
uint32 hash;
|
||||
LWLock *partitionLock;
|
||||
Buffer buffer;
|
||||
bool no_redo_needed;
|
||||
BlockNumber relsize;
|
||||
|
||||
if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
|
||||
return true;
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
|
||||
elog(PANIC, "failed to locate backup block with ID %d", block_id);
|
||||
#else
|
||||
XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Out of an abundance of caution, we always run redo on shared catalogs,
|
||||
* regardless of whether the block is stored in shared buffers.
|
||||
* See also this function's top comment.
|
||||
*/
|
||||
if (!OidIsValid(rnode.dbNode))
|
||||
return false;
|
||||
|
||||
INIT_BUFFERTAG(tag, rnode, forknum, blkno);
|
||||
hash = BufTableHashCode(&tag);
|
||||
partitionLock = BufMappingPartitionLock(hash);
|
||||
|
||||
/*
|
||||
* Lock the partition of shared_buffers so that it can't be updated
|
||||
* concurrently.
|
||||
*/
|
||||
LWLockAcquire(partitionLock, LW_SHARED);
|
||||
|
||||
/* Try to find the relevant buffer */
|
||||
buffer = BufTableLookup(&tag, hash);
|
||||
|
||||
no_redo_needed = buffer < 0;
|
||||
|
||||
/* we don't have the buffer in memory, update lwLsn past this record */
|
||||
if (no_redo_needed)
|
||||
{
|
||||
SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
|
||||
lfc_evict(rnode, forknum, blkno);
|
||||
}
|
||||
else
|
||||
{
|
||||
SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
|
||||
}
|
||||
|
||||
LWLockRelease(partitionLock);
|
||||
|
||||
/* Extend the relation if we know its size */
|
||||
if (get_cached_relsize(rnode, forknum, &relsize))
|
||||
{
|
||||
if (relsize < blkno + 1)
|
||||
update_cached_relsize(rnode, forknum, blkno + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Size was not cached. We populate the cache now, with the size of the
|
||||
* relation measured after this WAL record is applied.
|
||||
*
|
||||
* This length is later reused when we open the smgr to read the block,
|
||||
* which is fine and expected.
|
||||
*/
|
||||
|
||||
NeonResponse *response;
|
||||
NeonNblocksResponse *nbresponse;
|
||||
NeonNblocksRequest request = {
|
||||
.req = (NeonRequest) {
|
||||
.lsn = end_recptr,
|
||||
.latest = false,
|
||||
.tag = T_NeonNblocksRequest,
|
||||
},
|
||||
.rnode = rnode,
|
||||
.forknum = forknum,
|
||||
};
|
||||
|
||||
response = page_server_request(&request);
|
||||
|
||||
        Assert(response->tag == T_NeonNblocksResponse);
        nbresponse = (NeonNblocksResponse *) response;

        Assert(nbresponse->n_blocks > blkno);

        set_cached_relsize(rnode, forknum, nbresponse->n_blocks);

        elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
    }

    return no_redo_needed;
}
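The net effect of the filter on the last-written LSN cache: if the block is in neither shared buffers nor (after the eviction above) the LFC, redo is skipped and the block's LwLSN is advanced past this record so the next reader fetches the post-redo page from the pageserver; otherwise the LwLSN stays just before the record. A hedged sketch of that rule, with stand-in types rather than the extension's real API:

```rust
// Illustrative sketch of the LwLSN bookkeeping above; `Lsn` and the setter are
// stand-ins for the extension's types, not its real API.
#[derive(Clone, Copy)]
struct Lsn(u64);

fn set_last_written_lsn(_lsn: Lsn) { /* stands in for SetLastWrittenLSNForBlock() */ }

/// Returns true when redo can be skipped for this block.
fn filter_block(record_end: Lsn, record_start: Lsn, in_shared_buffers: bool) -> bool {
    let skip_redo = !in_shared_buffers;
    if skip_redo {
        // Nobody holds this page locally: any later read must fetch the
        // post-redo version, so advance the LwLSN past this record.
        set_last_written_lsn(record_end);
    } else {
        // The buffer will be redone in place; keep the LwLSN just before the record.
        set_last_written_lsn(Lsn(record_start.0 - 1));
    }
    skip_redo
}

fn main() {
    assert!(filter_block(Lsn(0x2000), Lsn(0x1f00), false)); // not cached anywhere => skip redo
    assert!(!filter_block(Lsn(0x2000), Lsn(0x1f00), true)); // in shared buffers => redo it
}
```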
@@ -1964,18 +1964,26 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs)
    {
        if (safekeeper[i].appendResponse.hs.ts != 0)
        {
            if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin))
            HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs;
            if (FullTransactionIdIsNormal(skhs->xmin)
                && FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
            {
                hs->xmin = safekeeper[i].appendResponse.hs.xmin;
                hs->ts = safekeeper[i].appendResponse.hs.ts;
                hs->xmin = skhs->xmin;
                hs->ts = skhs->ts;
            }
            if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin))
            if (FullTransactionIdIsNormal(skhs->catalog_xmin)
                && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
            {
                hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin;
                hs->ts = safekeeper[i].appendResponse.hs.ts;
                hs->catalog_xmin = skhs->catalog_xmin;
                hs->ts = skhs->ts;
            }
        }
    }

    if (hs->xmin.value == ~0)
        hs->xmin = InvalidFullTransactionId;
    if (hs->catalog_xmin.value == ~0)
        hs->catalog_xmin = InvalidFullTransactionId;
}
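The new code only lets a safekeeper's feedback lower the combined xmin/catalog_xmin when the reported id is a normal transaction id, so an unset (all-ones) value can no longer win the comparison. A tiny sketch of that "minimum over the values actually reported" idea, with plain integers standing in for FullTransactionId (illustrative only):

```rust
// Illustrative only: u64 stands in for FullTransactionId and INVALID for
// InvalidFullTransactionId; the point is "minimum over the values that are set".
const INVALID: u64 = u64::MAX;

fn combine_xmins(xmins: &[u64]) -> u64 {
    xmins
        .iter()
        .copied()
        .filter(|&x| x != INVALID) // ignore safekeepers that report no xmin
        .min()
        .unwrap_or(INVALID) // nobody reported one => leave it invalid
}

fn main() {
    assert_eq!(combine_xmins(&[INVALID, 730, 512]), 512);
    assert_eq!(combine_xmins(&[INVALID, INVALID]), INVALID);
}
```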
/*
|
||||
|
||||
@@ -62,6 +62,8 @@ utils.workspace = true
|
||||
uuid.workspace = true
|
||||
webpki-roots.workspace = true
|
||||
x509-parser.workspace = true
|
||||
native-tls.workspace = true
|
||||
postgres-native-tls.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::{
|
||||
use pq_proto::BeMessage as Be;
|
||||
use thiserror::Error;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{info, info_span};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
@@ -87,6 +88,16 @@ pub(super) async fn authenticate(
|
||||
.dbname(&db_info.dbname)
|
||||
.user(&db_info.user);
|
||||
|
||||
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
|
||||
// while direct connections do not. Once we migrate to pg_sni_proxy
|
||||
// everywhere, we can remove this.
|
||||
if db_info.host.contains("--") {
|
||||
// we need TLS connection with SNI info to properly route it
|
||||
config.ssl_mode(SslMode::Require);
|
||||
} else {
|
||||
config.ssl_mode(SslMode::Disable);
|
||||
}
|
||||
|
||||
if let Some(password) = db_info.password {
|
||||
config.password(password.as_ref());
|
||||
}
|
||||
@@ -96,6 +107,7 @@ pub(super) async fn authenticate(
|
||||
value: NodeInfo {
|
||||
config,
|
||||
aux: db_info.aux.into(),
|
||||
allow_self_signed_compute: false, // caller may override
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
proxy/src/bin/auth_renew_router.rs (new file)
@@ -0,0 +1,183 @@
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
use tokio::{io::AsyncWriteExt, net::TcpListener};
|
||||
|
||||
use anyhow::Context;
|
||||
use clap::{self, Arg};
|
||||
use futures::TryFutureExt;
|
||||
use proxy::{
|
||||
auth::{self, AuthFlow},
|
||||
cancellation::CancelMap,
|
||||
compute::ConnCfg,
|
||||
console::messages::MetricsAuxInfo,
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::project_git_version;
|
||||
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn cli() -> clap::Command {
|
||||
clap::Command::new("Auth renew proxy")
|
||||
.disable_help_flag(true)
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("listen")
|
||||
.short('l')
|
||||
.long("listen")
|
||||
.help("listen for incoming client connections on ip:port")
|
||||
.default_value("127.0.0.1:4432"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dest-host")
|
||||
.long("dest-host")
|
||||
.help("destination hosts")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dest-port")
|
||||
.long("dest-port")
|
||||
.help("destination port")
|
||||
.default_value("5432"),
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _logging_guard = proxy::logging::init().await?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
|
||||
let args = cli().get_matches();
|
||||
|
||||
let dest_host: String = args.get_one::<String>("dest-host").unwrap().parse()?;
|
||||
let dest_port: u16 = args.get_one::<String>("dest-port").unwrap().parse()?;
|
||||
let listen_address: SocketAddr = args.get_one::<String>("listen").unwrap().parse()?;
|
||||
|
||||
// Start listening for incoming client connections
|
||||
info!("Starting proxy on {listen_address}");
|
||||
let proxy_listener = TcpListener::bind(listen_address).await?;
|
||||
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let main = proxy::flatten_err(tokio::spawn(task_main(
|
||||
Arc::new(dest_host),
|
||||
dest_port,
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
)));
|
||||
let signals_task = proxy::flatten_err(tokio::spawn(proxy::handle_signals(cancellation_token)));
|
||||
|
||||
tokio::select! {
|
||||
res = main => { res?; },
|
||||
res = signals_task => { res?; },
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn task_main(
|
||||
dest_host: Arc<String>,
|
||||
dest_port: u16,
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("proxy has shut down");
|
||||
}
|
||||
|
||||
// When set for the server socket, the keepalive setting
|
||||
// will be inherited by all accepted client sockets.
|
||||
socket2::SockRef::from(&listener).set_keepalive(true)?;
|
||||
|
||||
let mut connections = tokio::task::JoinSet::new();
|
||||
let cancel_map = Arc::new(CancelMap::default());
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
accept_result = listener.accept() => {
|
||||
let (socket, peer_addr) = accept_result?;
|
||||
info!("accepted postgres client connection from {peer_addr}");
|
||||
|
||||
let cancel_map = Arc::clone(&cancel_map);
|
||||
let dest_host = Arc::clone(&dest_host);
|
||||
|
||||
connections.spawn(
|
||||
async move {
|
||||
info!("spawned a task for {peer_addr}");
|
||||
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
handle_client(dest_host, dest_port, &cancel_map, socket).await
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
}),
|
||||
);
|
||||
}
|
||||
_ = cancellation_token.cancelled() => {
|
||||
drop(listener);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Drain connections
|
||||
while let Some(res) = connections.join_next().await {
|
||||
if let Err(e) = res {
|
||||
if !e.is_panic() && !e.is_cancelled() {
|
||||
warn!("unexpected error from joined connection task: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_client(
|
||||
dest_host: Arc<String>,
|
||||
dest_port: u16,
|
||||
cancel_map: &CancelMap,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
let do_handshake = proxy::proxy::handshake(stream, None, cancel_map);
|
||||
let (mut stream, params) = match do_handshake.await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(()), // it's a cancellation request
|
||||
};
|
||||
|
||||
    // Here we force plain text auth for the client and use the received password to authenticate
    // to the destination server. Instead, we could always trust the client and take the password / JWT
    // each time we get a connection.
|
||||
let password = AuthFlow::new(&mut stream)
|
||||
.begin(auth::CleartextPassword)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
let mut conn_cfg = ConnCfg::new();
|
||||
conn_cfg.set_startup_params(¶ms);
|
||||
conn_cfg.password(password);
|
||||
conn_cfg.host(dest_host.as_str());
|
||||
conn_cfg.port(dest_port);
|
||||
conn_cfg.ssl_mode(SslMode::Require);
|
||||
|
||||
info!("destination: {:?}:{}", dest_host, dest_port);
|
||||
|
||||
let mut conn = conn_cfg
|
||||
.connect(false)
|
||||
.or_else(|e| stream.throw_error(e))
|
||||
.await?;
|
||||
|
||||
cancel_map
|
||||
.with_session(|session| async {
|
||||
proxy::proxy::prepare_client_connection(&conn, false, session, &mut stream).await?;
|
||||
let (stream, read_buf) = stream.into_inner();
|
||||
conn.stream.write_all(&read_buf).await?;
|
||||
let metrics_aux: MetricsAuxInfo = Default::default();
|
||||
proxy::proxy::proxy_pass(stream, conn.stream, &metrics_aux).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
proxy/src/bin/pg_sni_router.rs (new file)
@@ -0,0 +1,250 @@
|
||||
/// A stand-alone program that routes connections, e.g. from
|
||||
/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
|
||||
///
|
||||
/// This allows connecting to pods/services running in the same Kubernetes cluster from
|
||||
/// the outside. Similar to an ingress controller for HTTPS.
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::{self, Arg};
|
||||
use futures::TryFutureExt;
|
||||
use proxy::console::messages::MetricsAuxInfo;
|
||||
use proxy::stream::{PqStream, Stream};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{project_git_version, sentry_init::init_sentry};
|
||||
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn cli() -> clap::Command {
|
||||
clap::Command::new("Neon proxy/router")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("listen")
|
||||
.short('l')
|
||||
.long("listen")
|
||||
.help("listen for incoming client connections on ip:port")
|
||||
.default_value("127.0.0.1:4432"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-key")
|
||||
.short('k')
|
||||
.long("tls-key")
|
||||
.help("path to TLS key for client postgres connections")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-cert")
|
||||
.short('c')
|
||||
.long("tls-cert")
|
||||
.help("path to TLS cert for client postgres connections")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dest")
|
||||
.short('d')
|
||||
.long("destination")
|
||||
.help("append this domain zone to the SNI hostname to get the destination address")
|
||||
.required(true),
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _logging_guard = proxy::logging::init().await?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
let args = cli().get_matches();
|
||||
let destination: String = args.get_one::<String>("dest").unwrap().parse()?;
|
||||
|
||||
// Configure TLS
|
||||
let tls_config: Arc<rustls::ServerConfig> = match (
|
||||
args.get_one::<String>("tls-key"),
|
||||
args.get_one::<String>("tls-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => {
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
||||
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().map(rustls::PrivateKey).unwrap()
|
||||
};
|
||||
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
|
||||
let cert_chain = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.context(format!(
|
||||
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.into_iter()
|
||||
.map(rustls::Certificate)
|
||||
.collect()
|
||||
};
|
||||
|
||||
rustls::ServerConfig::builder()
|
||||
.with_safe_default_cipher_suites()
|
||||
.with_safe_default_kx_groups()
|
||||
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(cert_chain, key)?
|
||||
.into()
|
||||
}
|
||||
_ => bail!("tls-key and tls-cert must be specified"),
|
||||
};
|
||||
|
||||
// Start listening for incoming client connections
|
||||
let proxy_address: SocketAddr = args.get_one::<String>("listen").unwrap().parse()?;
|
||||
info!("Starting sni router on {proxy_address}");
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let main = proxy::flatten_err(tokio::spawn(task_main(
|
||||
Arc::new(destination),
|
||||
tls_config,
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
)));
|
||||
let signals_task = proxy::flatten_err(tokio::spawn(proxy::handle_signals(cancellation_token)));
|
||||
|
||||
tokio::select! {
|
||||
res = main => { res?; },
|
||||
res = signals_task => { res?; },
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn task_main(
|
||||
dest_suffix: Arc<String>,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// When set for the server socket, the keepalive setting
|
||||
// will be inherited by all accepted client sockets.
|
||||
socket2::SockRef::from(&listener).set_keepalive(true)?;
|
||||
|
||||
let mut connections = tokio::task::JoinSet::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
accept_result = listener.accept() => {
|
||||
let (socket, peer_addr) = accept_result?;
|
||||
info!("accepted postgres client connection from {peer_addr}");
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let tls_config = Arc::clone(&tls_config);
|
||||
let dest_suffix = Arc::clone(&dest_suffix);
|
||||
|
||||
connections.spawn(
|
||||
async move {
|
||||
info!("spawned a task for {peer_addr}");
|
||||
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
handle_client(dest_suffix, tls_config, session_id, socket).await
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
}),
|
||||
);
|
||||
}
|
||||
_ = cancellation_token.cancelled() => {
|
||||
drop(listener);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drain connections
|
||||
info!("waiting for all client connections to finish");
|
||||
while let Some(res) = connections.join_next().await {
|
||||
if let Err(e) = res {
|
||||
if !e.is_panic() && !e.is_cancelled() {
|
||||
warn!("unexpected error from joined connection task: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
info!("all client connections have finished");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
|
||||
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
raw_stream: S,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
) -> anyhow::Result<Stream<S>> {
|
||||
let mut stream = PqStream::new(Stream::from_raw(raw_stream));
|
||||
|
||||
let msg = stream.read_startup_packet().await?;
|
||||
info!("received {msg:?}");
|
||||
use pq_proto::FeStartupPacket::*;
|
||||
|
||||
match msg {
|
||||
SslRequest => {
|
||||
stream
|
||||
.write_message(&pq_proto::BeMessage::EncryptionResponse(true))
|
||||
.await?;
|
||||
// Upgrade raw stream into a secure TLS-backed stream.
|
||||
// NOTE: We've consumed `tls`; this fact will be used later.
|
||||
|
||||
let (raw, read_buf) = stream.into_inner();
|
||||
            // TODO: Normally, the client doesn't send any data before
            // the server says the TLS handshake is ok and read_buf is empty.
            // However, you could imagine pipelining of postgres
            // SSLRequest + TLS ClientHello in one hunk similar to
            // pipelining in our node js driver. We should probably
            // support that by chaining read_buf with the stream.
|
||||
if !read_buf.is_empty() {
|
||||
bail!("data is sent before server replied with EncryptionResponse");
|
||||
}
|
||||
Ok(raw.upgrade(tls_config).await?)
|
||||
}
|
||||
_ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?,
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
async fn handle_client(
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
    session_id: uuid::Uuid,
    stream: impl AsyncRead + AsyncWrite + Unpin,
) -> anyhow::Result<()> {
    let tls_stream = ssl_handshake(stream, tls_config).await?;

    // Cut off first part of the SNI domain
    // We receive required destination details in the format of
    // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain`
    let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?;
    let dest: Vec<&str> = sni
        .split_once('.')
        .context("invalid SNI")?
        .0
        .splitn(3, "--")
        .collect();
    let port = dest[2].parse::<u16>().context("invalid port")?;
    let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port);

    info!("destination: {}", destination);

    let client = tokio::net::TcpStream::connect(destination).await?;

    let metrics_aux: MetricsAuxInfo = Default::default();
    proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
}
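The routing rule is purely lexical: the first DNS label of the SNI name is split on "--" into service, namespace and port, and the remainder of the name is replaced by the configured destination suffix. A stand-alone sketch of that mapping under the same convention, reusing the `aaa--bbb--1234.external.domain` example from the file header comment:

```rust
// Illustrative sketch of the SNI -> destination mapping used by pg_sni_router.
// Example: "aaa--bbb--1234.external.domain" + suffix "internal.domain"
//          => "aaa.bbb.internal.domain:1234"
fn route(sni: &str, dest_suffix: &str) -> Option<String> {
    let (label, _rest) = sni.split_once('.')?; // first DNS label only
    let mut parts = label.splitn(3, "--");
    let service = parts.next()?;
    let namespace = parts.next()?;
    let port: u16 = parts.next()?.parse().ok()?; // reject missing or non-numeric ports
    Some(format!("{service}.{namespace}.{dest_suffix}:{port}"))
}

fn main() {
    assert_eq!(
        route("aaa--bbb--1234.external.domain", "internal.domain").as_deref(),
        Some("aaa.bbb.internal.domain:1234")
    );
    assert_eq!(route("no-port-here.example.com", "internal.domain"), None); // malformed SNI
}
```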
@@ -1,49 +1,23 @@
|
||||
//! Postgres protocol proxy/router.
|
||||
//!
|
||||
//! This service listens psql port and can check auth via external service
|
||||
//! (control plane API in our case) and can create new databases and accounts
|
||||
//! in somewhat transparent manner (again via communication with control plane API).
|
||||
use proxy::auth;
|
||||
use proxy::console;
|
||||
use proxy::http;
|
||||
use proxy::metrics;
|
||||
|
||||
mod auth;
|
||||
mod cache;
|
||||
mod cancellation;
|
||||
mod compute;
|
||||
mod config;
|
||||
mod console;
|
||||
mod error;
|
||||
mod http;
|
||||
mod logging;
|
||||
mod metrics;
|
||||
mod parse;
|
||||
mod proxy;
|
||||
mod sasl;
|
||||
mod scram;
|
||||
mod stream;
|
||||
mod url;
|
||||
mod waiters;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::bail;
|
||||
use clap::{self, Arg};
|
||||
use config::ProxyConfig;
|
||||
use futures::FutureExt;
|
||||
use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use proxy::config::{self, ProxyConfig};
|
||||
use std::{borrow::Cow, net::SocketAddr};
|
||||
use tokio::net::TcpListener;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info, warn};
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::{project_git_version, sentry_init::init_sentry};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
/// Flattens `Result<Result<T>>` into `Result<T>`.
|
||||
async fn flatten_err(
|
||||
f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
|
||||
) -> anyhow::Result<()> {
|
||||
f.map(|r| r.context("join error").and_then(|x| x)).await
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _logging_guard = logging::init().await?;
|
||||
let _logging_guard = proxy::logging::init().await?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
@@ -69,7 +43,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let mut client_tasks = vec![tokio::spawn(proxy::task_main(
|
||||
let mut client_tasks = vec![tokio::spawn(proxy::proxy::task_main(
|
||||
config,
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
@@ -88,7 +62,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
let mut tasks = vec![
|
||||
tokio::spawn(handle_signals(cancellation_token)),
|
||||
tokio::spawn(proxy::handle_signals(cancellation_token)),
|
||||
tokio::spawn(http::server::task_main(http_listener)),
|
||||
tokio::spawn(console::mgmt::task_main(mgmt_listener)),
|
||||
];
|
||||
@@ -97,8 +71,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
|
||||
}
|
||||
|
||||
let tasks = futures::future::try_join_all(tasks.into_iter().map(flatten_err));
|
||||
let client_tasks = futures::future::try_join_all(client_tasks.into_iter().map(flatten_err));
|
||||
let tasks = futures::future::try_join_all(tasks.into_iter().map(proxy::flatten_err));
|
||||
let client_tasks =
|
||||
futures::future::try_join_all(client_tasks.into_iter().map(proxy::flatten_err));
|
||||
tokio::select! {
|
||||
// We are only expecting an error from these forever tasks
|
||||
res = tasks => { res?; },
|
||||
@@ -107,33 +82,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> {
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
|
||||
let mut hangup = signal(SignalKind::hangup())?;
|
||||
let mut interrupt = signal(SignalKind::interrupt())?;
|
||||
let mut terminate = signal(SignalKind::terminate())?;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Hangup is commonly used for config reload.
|
||||
_ = hangup.recv() => {
|
||||
warn!("received SIGHUP; config reload is not supported");
|
||||
}
|
||||
// Shut down the whole application.
|
||||
_ = interrupt.recv() => {
|
||||
warn!("received SIGINT, exiting immediately");
|
||||
bail!("interrupted");
|
||||
}
|
||||
_ = terminate.recv() => {
|
||||
warn!("received SIGTERM, shutting down once all existing connections have closed");
|
||||
token.cancel();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let tls_config = match (
|
||||
@@ -149,6 +97,14 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
|
||||
_ => bail!("either both or neither tls-key and tls-cert must be specified"),
|
||||
};
|
||||
|
||||
let allow_self_signed_compute: bool = args
|
||||
.get_one::<String>("allow-self-signed-compute")
|
||||
.unwrap()
|
||||
.parse()?;
|
||||
if allow_self_signed_compute {
|
||||
warn!("allowing self-signed compute certificates");
|
||||
}
|
||||
|
||||
let metric_collection = match (
|
||||
args.get_one::<String>("metric-collection-endpoint"),
|
||||
args.get_one::<String>("metric-collection-interval"),
|
||||
@@ -198,6 +154,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
|
||||
tls_config,
|
||||
auth_backend,
|
||||
metric_collection,
|
||||
allow_self_signed_compute,
|
||||
}));
|
||||
|
||||
Ok(config)
|
||||
@@ -288,6 +245,12 @@ fn cli() -> clap::Command {
|
||||
.help("cache for `wake_compute` api method (use `size=0` to disable)")
|
||||
.default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("allow-self-signed-compute")
|
||||
.long("allow-self-signed-compute")
|
||||
.help("Allow self-signed certificates for compute nodes (for testing)")
|
||||
.default_value("false"),
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -1,11 +1,11 @@
|
||||
use crate::{cancellation::CancelClosure, error::UserFacingError};
|
||||
use futures::TryFutureExt;
|
||||
use futures::{FutureExt, TryFutureExt};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::{io, net::SocketAddr};
|
||||
use std::{io, net::SocketAddr, time::Duration};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::NoTls;
|
||||
use tokio_postgres::tls::MakeTlsConnect;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
|
||||
@@ -19,6 +19,9 @@ pub enum ConnectionError {
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
CouldNotConnect(#[from] io::Error),
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
TlsError(#[from] native_tls::Error),
|
||||
}
|
||||
|
||||
impl UserFacingError for ConnectionError {
|
||||
@@ -125,14 +128,34 @@ impl std::ops::DerefMut for ConnCfg {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConnCfg {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnCfg {
    /// Establish a raw TCP connection to the compute node.
    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> {
        use tokio_postgres::config::Host;

        // wrap TcpStream::connect with timeout
        let connect_with_timeout = |host, port| {
            let connection_timeout = Duration::from_millis(10000);
            tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map(
                move |res| match res {
                    Ok(tcpstream_connect_res) => tcpstream_connect_res,
                    Err(_) => Err(io::Error::new(
                        io::ErrorKind::TimedOut,
                        format!("exceeded connection timeout {connection_timeout:?}"),
                    )),
                },
            )
        };

        let connect_once = |host, port| {
            info!("trying to connect to compute node at {host}:{port}");
            TcpStream::connect((host, port)).and_then(|socket| async {
            connect_with_timeout(host, port).and_then(|socket| async {
                let socket_addr = socket.peer_addr()?;
                // This prevents load balancer from severing the connection.
                socket2::SockRef::from(&socket).set_keepalive(true)?;
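The closure above wraps TcpStream::connect in tokio::time::timeout and converts the elapsed case into an io::Error so it flows through the same error path as any other connection failure. A minimal stand-alone sketch of that pattern (the 10 s budget mirrors the hard-coded value above; the helper and main are illustrative only):

```rust
use std::{io, time::Duration};
use tokio::net::TcpStream;

/// Connect with an upper bound on how long we are willing to wait.
/// Sketch of the pattern used above; not the proxy's actual helper.
async fn connect_with_timeout(host: &str, port: u16) -> io::Result<TcpStream> {
    let budget = Duration::from_millis(10_000);
    match tokio::time::timeout(budget, TcpStream::connect((host, port))).await {
        Ok(connect_result) => connect_result, // connect finished (ok or err) within the budget
        Err(_elapsed) => Err(io::Error::new(
            io::ErrorKind::TimedOut,
            format!("exceeded connection timeout {budget:?}"),
        )),
    }
}

#[tokio::main]
async fn main() -> io::Result<()> {
    // Hypothetical host/port, for illustration only.
    let socket = connect_with_timeout("localhost", 5432).await?;
    socket.set_nodelay(true)?;
    Ok(())
}
```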
@@ -165,9 +188,8 @@ impl ConnCfg {
|
||||
Host::Unix(_) => continue, // unix sockets are not welcome here
|
||||
};
|
||||
|
||||
// TODO: maybe we should add a timeout.
|
||||
match connect_once(host, *port).await {
|
||||
Ok(socket) => return Ok(socket),
|
||||
Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)),
|
||||
Err(err) => {
|
||||
// We can't throw an error here, as there might be more hosts to try.
|
||||
warn!("couldn't connect to compute node at {host}:{port}: {err}");
|
||||
@@ -187,7 +209,10 @@ impl ConnCfg {
|
||||
|
||||
pub struct PostgresConnection {
|
||||
/// Socket connected to a compute node.
|
||||
pub stream: TcpStream,
|
||||
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
|
||||
tokio::net::TcpStream,
|
||||
postgres_native_tls::TlsStream<tokio::net::TcpStream>,
|
||||
>,
|
||||
/// PostgreSQL connection parameters.
|
||||
pub params: std::collections::HashMap<String, String>,
|
||||
/// Query cancellation token.
|
||||
@@ -195,11 +220,27 @@ pub struct PostgresConnection {
|
||||
}
|
||||
|
||||
impl ConnCfg {
    async fn do_connect(&self) -> Result<PostgresConnection, ConnectionError> {
        // TODO: establish a secure connection to the DB.
        let (socket_addr, mut stream) = self.connect_raw().await?;
        let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?;
        info!("connected to compute node at {socket_addr}");
    async fn do_connect(
        &self,
        allow_self_signed_compute: bool,
    ) -> Result<PostgresConnection, ConnectionError> {
        let (socket_addr, stream, host) = self.connect_raw().await?;

        let tls_connector = native_tls::TlsConnector::builder()
            .danger_accept_invalid_certs(allow_self_signed_compute)
            .build()
            .unwrap();
        let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
        let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;

        // connect_raw() will not use TLS if sslmode is "disable"
        let (client, connection) = self.0.connect_raw(stream, tls).await?;
        let stream = connection.stream.into_inner();

        info!(
            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
            self.0.get_ssl_mode()
        );

        // This is very ugly but as of now there's no better way to
        // extract the connection parameters from tokio-postgres' connection.
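The call into native-tls above is what makes allow_self_signed_compute effective: when it is set, certificate validation is relaxed so test compute nodes with self-signed certificates can still be reached. A hedged sketch of that connector construction, using the same native-tls and postgres-native-tls calls (the helper name is invented):

```rust
use postgres_native_tls::MakeTlsConnector;

/// Build the TLS connector used for compute connections.
/// `allow_self_signed` should only ever be true in test setups.
fn make_compute_tls(allow_self_signed: bool) -> Result<MakeTlsConnector, native_tls::Error> {
    let connector = native_tls::TlsConnector::builder()
        .danger_accept_invalid_certs(allow_self_signed) // relaxed validation for tests only
        .build()?;
    Ok(MakeTlsConnector::new(connector))
}
```

The resulting connector is then handed to tokio-postgres' connect_raw, which still skips TLS entirely when sslmode is set to disable.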
@@ -220,8 +261,11 @@ impl ConnCfg {
|
||||
}
|
||||
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(&self) -> Result<PostgresConnection, ConnectionError> {
|
||||
self.do_connect()
|
||||
pub async fn connect(
|
||||
&self,
|
||||
allow_self_signed_compute: bool,
|
||||
) -> Result<PostgresConnection, ConnectionError> {
|
||||
self.do_connect(allow_self_signed_compute)
|
||||
.inspect_err(|err| {
|
||||
// Immediately log the error we have at our disposal.
|
||||
error!("couldn't connect to compute node: {err}");
|
||||
|
||||
@@ -12,6 +12,7 @@ pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
pub auth_backend: auth::BackendType<'static, ()>,
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
pub allow_self_signed_compute: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -170,6 +170,9 @@ pub struct NodeInfo {
|
||||
|
||||
/// Labels for proxy's metrics.
|
||||
pub aux: Arc<MetricsAuxInfo>,
|
||||
|
||||
/// Whether we should accept self-signed certificates (for testing)
|
||||
pub allow_self_signed_compute: bool,
|
||||
}
|
||||
|
||||
pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
|
||||
|
||||
@@ -8,6 +8,7 @@ use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUr
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use thiserror::Error;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
@@ -86,11 +87,13 @@ impl Api {
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(self.endpoint.host_str().unwrap_or("localhost"))
|
||||
.port(self.endpoint.port().unwrap_or(5432));
|
||||
.port(self.endpoint.port().unwrap_or(5432))
|
||||
.ssl_mode(SslMode::Disable);
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
aux: Default::default(),
|
||||
allow_self_signed_compute: false,
|
||||
};
|
||||
|
||||
Ok(node)
|
||||
|
||||
@@ -8,6 +8,7 @@ use super::{
|
||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -100,11 +101,12 @@ impl Api {
|
||||
// We'll set username and such later using the startup message.
|
||||
// TODO: add more type safety (in progress).
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config.host(host).port(port);
|
||||
config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
aux: body.aux.into(),
|
||||
allow_self_signed_compute: false,
|
||||
};
|
||||
|
||||
Ok(node)
|
||||
|
||||
proxy/src/lib.rs (new file)
@@ -0,0 +1,57 @@
|
||||
use anyhow::{bail, Context};
|
||||
use futures::{Future, FutureExt};
|
||||
use tokio::task::JoinError;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
|
||||
pub mod auth;
|
||||
pub mod cache;
|
||||
pub mod cancellation;
|
||||
pub mod compute;
|
||||
pub mod config;
|
||||
pub mod console;
|
||||
pub mod error;
|
||||
pub mod http;
|
||||
pub mod logging;
|
||||
pub mod metrics;
|
||||
pub mod parse;
|
||||
pub mod proxy;
|
||||
pub mod sasl;
|
||||
pub mod scram;
|
||||
pub mod stream;
|
||||
pub mod url;
|
||||
pub mod waiters;
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> {
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
|
||||
let mut hangup = signal(SignalKind::hangup())?;
|
||||
let mut interrupt = signal(SignalKind::interrupt())?;
|
||||
let mut terminate = signal(SignalKind::terminate())?;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Hangup is commonly used for config reload.
|
||||
_ = hangup.recv() => {
|
||||
warn!("received SIGHUP; config reload is not supported");
|
||||
}
|
||||
// Shut down the whole application.
|
||||
_ = interrupt.recv() => {
|
||||
warn!("received SIGINT, exiting immediately");
|
||||
bail!("interrupted");
|
||||
}
|
||||
_ = terminate.recv() => {
|
||||
warn!("received SIGTERM, shutting down once all existing connections have closed");
|
||||
token.cancel();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Flattens `Result<Result<T>>` into `Result<T>`.
pub async fn flatten_err(
    f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
) -> anyhow::Result<()> {
    f.map(|r| r.context("join error").and_then(|x| x)).await
}
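flatten_err exists so that a spawned task's JoinError (the task panicked or was cancelled) and the task's own anyhow error can be handled through a single Result. A small usage sketch, with the helper restated locally so the example compiles on its own; the worker body is invented for illustration:

```rust
use anyhow::Context;
use futures::FutureExt;
use tokio::task::JoinError;

// Restated locally only to keep the snippet self-contained; it matches the helper above.
async fn flatten_err(
    f: impl std::future::Future<Output = Result<anyhow::Result<()>, JoinError>>,
) -> anyhow::Result<()> {
    f.map(|r| r.context("join error").and_then(|x| x)).await
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // A made-up worker; in the proxy this would be task_main(), handle_signals(), etc.
    let worker = tokio::spawn(async { anyhow::Ok(()) });

    // One `?` now covers both a panicked/cancelled task and the task's own error.
    flatten_err(worker).await?;
    Ok(())
}
```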
@@ -1,4 +1,3 @@
|
||||
use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
use tracing_subscriber::{
|
||||
filter::{EnvFilter, LevelFilter},
|
||||
prelude::*,
|
||||
@@ -22,13 +21,8 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
|
||||
.with_writer(std::io::stderr)
|
||||
.with_target(false);
|
||||
|
||||
let otlp_layer = tracing_utils::init_tracing("proxy")
|
||||
.await
|
||||
.map(OpenTelemetryLayer::new);
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(env_filter)
|
||||
.with(otlp_layer)
|
||||
.with(fmt_layer)
|
||||
.try_init()?;
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
current_metrics.push((
|
||||
Ids {
|
||||
endpoint_id: endpoint_id.to_string(),
|
||||
branch_id: "".to_string(),
|
||||
branch_id: branch_id.to_string(),
|
||||
},
|
||||
(value, Utc::now()),
|
||||
));
|
||||
|
||||
@@ -95,9 +95,9 @@ pub async fn task_main(
|
||||
|
||||
handle_client(config, &cancel_map, session_id, socket).await
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
.unwrap_or_else(move |e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
error!(?session_id, "per-client task finished with an error: {e:#}");
|
||||
}),
|
||||
);
|
||||
}
|
||||
@@ -155,7 +155,7 @@ pub async fn handle_ws_client(
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
let client = Client::new(stream, creds, ¶ms, session_id, false);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session, true))
|
||||
.await
|
||||
@@ -194,7 +194,15 @@ async fn handle_client(
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
let allow_self_signed_compute = config.allow_self_signed_compute;
|
||||
|
||||
let client = Client::new(
|
||||
stream,
|
||||
creds,
|
||||
¶ms,
|
||||
session_id,
|
||||
allow_self_signed_compute,
|
||||
);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session, false))
|
||||
.await
|
||||
@@ -205,7 +213,7 @@ async fn handle_client(
|
||||
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
|
||||
/// we also take an extra care of propagating only the select handshake errors to client.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
stream: S,
|
||||
mut tls: Option<&TlsConfig>,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -297,9 +305,11 @@ async fn connect_to_compute_once(
|
||||
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
|
||||
};
|
||||
|
||||
let allow_self_signed_compute = node_info.allow_self_signed_compute;
|
||||
|
||||
node_info
|
||||
.config
|
||||
.connect()
|
||||
.connect(allow_self_signed_compute)
|
||||
.inspect_err(invalidate_cache)
|
||||
.await
|
||||
}
|
||||
@@ -340,7 +350,7 @@ async fn connect_to_compute(
|
||||
|
||||
/// Finish client connection initialization: confirm auth success, send params, etc.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn prepare_client_connection(
|
||||
pub async fn prepare_client_connection(
|
||||
node: &compute::PostgresConnection,
|
||||
reported_auth_ok: bool,
|
||||
session: cancellation::Session<'_>,
|
||||
@@ -378,7 +388,7 @@ async fn prepare_client_connection(
|
||||
|
||||
/// Forward bytes in both directions (client <-> compute).
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn proxy_pass(
|
||||
pub async fn proxy_pass(
|
||||
client: impl AsyncRead + AsyncWrite + Unpin,
|
||||
compute: impl AsyncRead + AsyncWrite + Unpin,
|
||||
aux: &MetricsAuxInfo,
|
||||
@@ -420,6 +430,8 @@ struct Client<'a, S> {
|
||||
params: &'a StartupMessageParams,
|
||||
/// Unique connection ID.
|
||||
session_id: uuid::Uuid,
|
||||
/// Allow self-signed certificates (for testing).
|
||||
allow_self_signed_compute: bool,
|
||||
}
|
||||
|
||||
impl<'a, S> Client<'a, S> {
|
||||
@@ -429,12 +441,14 @@ impl<'a, S> Client<'a, S> {
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
|
||||
params: &'a StartupMessageParams,
|
||||
session_id: uuid::Uuid,
|
||||
allow_self_signed_compute: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
creds,
|
||||
params,
|
||||
session_id,
|
||||
allow_self_signed_compute,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -451,6 +465,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
mut creds,
|
||||
params,
|
||||
session_id,
|
||||
allow_self_signed_compute,
|
||||
} = self;
|
||||
|
||||
let extra = console::ConsoleReqExtra {
|
||||
@@ -473,6 +488,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
value: mut node_info,
|
||||
} = auth_result;
|
||||
|
||||
node_info.allow_self_signed_compute = allow_self_signed_compute;
|
||||
|
||||
let mut node = connect_to_compute(&mut node_info, params, &extra, &creds)
|
||||
.or_else(|e| stream.throw_error(e))
|
||||
.await?;
|
||||
|
||||
@@ -1,4 +1,5 @@
#!/bin/bash
#!/usr/bin/env bash
set -euo pipefail

# If you save this in your path under the name "cargo-zclippy" (or whatever
# name you like), then you can run it as "cargo zclippy" from the shell prompt.
@@ -8,7 +9,11 @@
# warnings and errors right in the editor.
# In vscode, this setting is Rust-analyzer>Check On Save:Command

# * `-A unknown_lints` – do not warn about unknown lint suppressions
# that people with newer toolchains might use
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings
# NB: the CI runs the full feature powerset, so, it catches slightly more errors
# at the expense of longer runtime. This script is used by developers, so, don't
# do that here.

thisscript="${BASH_SOURCE[0]}"
thisscript_dir="$(dirname "$thisscript")"
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
exec cargo clippy --all-features $CLIPPY_COMMON_ARGS

@@ -19,11 +19,13 @@ git-version.workspace = true
hex.workspace = true
humantime.workspace = true
hyper.workspace = true
futures.workspace = true
once_cell.workspace = true
parking_lot.workspace = true
postgres.workspace = true
postgres-protocol.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
@@ -33,6 +35,7 @@ tokio = { workspace = true, features = ["fs"] }
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
toml_edit.workspace = true
tempfile.workspace = true
tracing.workspace = true
url.workspace = true
metrics.workspace = true
@@ -45,6 +48,3 @@ storage_broker.workspace = true
utils.workspace = true

workspace_hack.workspace = true

[dev-dependencies]
tempfile.workspace = true

@@ -134,7 +134,10 @@ fn main() -> anyhow::Result<()> {
// 1. init logging
// 2. tracing panic hook
// 3. sentry
logging::init(LogFormat::from_config(&args.log_format)?)?;
logging::init(
LogFormat::from_config(&args.log_format)?,
logging::TracingErrorLayerEnablement::Disabled,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
info!("version: {GIT_VERSION}");


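The new `replace_panic_hook_with_tracing_panic_hook()` call routes panics through the logging stack before Sentry is initialized. A minimal sketch of the general idea (not the repo's implementation):

```rust
// Sketch only: forward panic messages into `tracing` so they show up in the
// structured logs instead of raw stderr.
fn install_tracing_panic_hook() {
    std::panic::set_hook(Box::new(|info| {
        tracing::error!("panic: {}", info);
    }));
}
```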
@@ -91,7 +91,7 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
// connection to the broker.

// note: there are blocking operations below, but it's considered fine for now
tli.record_safekeeper_info(&msg).await?
tli.record_safekeeper_info(msg).await?
}
}
bail!("end of stream");

@@ -9,9 +9,10 @@ use std::path::PathBuf;
use anyhow::Result;
use chrono::{DateTime, Utc};
use postgres_ffi::XLogSegNo;
use serde::Deserialize;
use serde::Serialize;

use utils::http::json::display_serialize;
use serde_with::{serde_as, DisplayFromStr};
use utils::id::NodeId;
use utils::id::TenantTimelineId;
use utils::id::{TenantId, TimelineId};
@@ -22,11 +23,11 @@ use crate::safekeeper::SafekeeperMemState;
use crate::safekeeper::TermHistory;
use crate::SafeKeeperConf;

use crate::timeline::ReplicaState;
use crate::send_wal::WalSenderState;
use crate::GlobalTimelines;

/// Various filters that influence the resulting JSON output.
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct Args {
/// Dump all available safekeeper state. False by default.
pub dump_all: bool,
@@ -51,7 +52,7 @@ pub struct Args {
}

/// Response for debug dump request.
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct Response {
pub start_time: DateTime<Utc>,
pub finish_time: DateTime<Utc>,
@@ -61,7 +62,7 @@ pub struct Response {
}

/// Safekeeper configuration.
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct Config {
pub id: NodeId,
pub workdir: PathBuf,
@@ -72,22 +73,23 @@ pub struct Config {
pub wal_backup_enabled: bool,
}

#[derive(Debug, Serialize)]
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct Timeline {
#[serde(serialize_with = "display_serialize")]
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde(serialize_with = "display_serialize")]
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub control_file: Option<SafeKeeperState>,
pub memory: Option<Memory>,
pub disk_content: Option<DiskContent>,
}

#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct Memory {
pub is_cancelled: bool,
pub peers_info_len: usize,
pub replicas: Vec<Option<ReplicaState>>,
pub walsenders: Vec<WalSenderState>,
pub wal_backup_active: bool,
pub active: bool,
pub num_computes: u32,
@@ -102,12 +104,12 @@ pub struct Memory {
pub file_open: bool,
}

#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct DiskContent {
pub files: Vec<FileInfo>,
}

#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct FileInfo {
pub name: String,
pub size: u64,

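The switch from `#[serde(serialize_with = "display_serialize")]` to `#[serde_as(as = "DisplayFromStr")]` matters because the dump structs now derive `Deserialize` as well: `serde_with` handles both directions through `Display`/`FromStr`. A standalone sketch of the pattern (not code from the repo):

```rust
use serde_with::{serde_as, DisplayFromStr};

// Any field whose type implements Display + FromStr round-trips as a string.
#[serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
struct Example {
    #[serde_as(as = "DisplayFromStr")]
    lsn: u64, // stands in for Lsn here
}

fn demo() -> serde_json::Result<()> {
    let json = serde_json::to_string(&Example { lsn: 42 })?; // {"lsn":"42"}
    let back: Example = serde_json::from_str(&json)?;
    assert_eq!(back.lsn, 42);
    Ok(())
}
```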
@@ -3,6 +3,7 @@

use anyhow::Context;
use std::str;
use std::str::FromStr;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, info_span, Instrument};

@@ -49,12 +50,14 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
if cmd.starts_with("START_WAL_PUSH") {
Ok(SafekeeperPostgresCommand::StartWalPush)
} else if cmd.starts_with("START_REPLICATION") {
let re =
Regex::new(r"START_REPLICATION(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)").unwrap();
let re = Regex::new(
r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
)
.unwrap();
let mut caps = re.captures_iter(cmd);
let start_lsn = caps
.next()
.map(|cap| cap[1].parse::<Lsn>())
.map(|cap| Lsn::from_str(&cap[1]))
.context("parse start LSN from START_REPLICATION command")??;
Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
} else if cmd.starts_with("IDENTIFY_SYSTEM") {

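The widened regex simply tolerates an optional `SLOT <name>` clause before `PHYSICAL`; the captured start LSN is unchanged. A quick illustrative check (not a test from the repo):

```rust
use regex::Regex;

#[test]
fn start_replication_regex_accepts_slot() {
    let re = Regex::new(
        r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
    )
    .unwrap();
    for cmd in [
        "START_REPLICATION PHYSICAL 0/16B9188",
        "START_REPLICATION SLOT sub1 PHYSICAL 0/16B9188",
    ] {
        // Both the old and the new command form capture the same start LSN.
        let caps = re.captures(cmd).unwrap();
        assert_eq!(&caps[1], "0/16B9188");
    }
}
```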
@@ -3,19 +3,21 @@ use hyper::{Body, Request, Response, StatusCode, Uri};
use once_cell::sync::Lazy;
use postgres_ffi::WAL_SEGMENT_SIZE;
use safekeeper_api::models::SkTimelineInfo;
use serde::Serialize;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::str::FromStr;
use std::sync::Arc;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
use tokio::task::JoinError;
use utils::http::json::display_serialize;

use crate::debug_dump;
use crate::safekeeper::ServerInfo;
use crate::safekeeper::Term;
use crate::{debug_dump, pull_timeline};

use crate::timelines_global_map::TimelineDeleteForceResult;
use crate::GlobalTimelines;
@@ -57,44 +59,46 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {

/// Same as TermSwitchEntry, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
#[derive(Debug, Serialize)]
struct TermSwitchApiEntry {
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct TermSwitchApiEntry {
pub term: Term,
#[serde(serialize_with = "display_serialize")]
#[serde_as(as = "DisplayFromStr")]
pub lsn: Lsn,
}

/// Augment AcceptorState with epoch for convenience
#[derive(Debug, Serialize)]
struct AcceptorStateStatus {
term: Term,
epoch: Term,
term_history: Vec<TermSwitchApiEntry>,
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorStateStatus {
pub term: Term,
pub epoch: Term,
pub term_history: Vec<TermSwitchApiEntry>,
}

/// Info about timeline on safekeeper ready for reporting.
|
||||
#[derive(Debug, Serialize)]
|
||||
struct TimelineStatus {
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
tenant_id: TenantId,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
timeline_id: TimelineId,
|
||||
acceptor_state: AcceptorStateStatus,
|
||||
pg_info: ServerInfo,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
flush_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
timeline_start_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
local_start_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
commit_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
backup_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
peer_horizon_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
remote_consistent_lsn: Lsn,
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TimelineStatus {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
pub acceptor_state: AcceptorStateStatus,
|
||||
pub pg_info: ServerInfo,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub flush_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_start_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub local_start_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub commit_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub backup_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
}
|
||||
|
||||
fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
|
||||
@@ -144,7 +148,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
commit_lsn: inmem.commit_lsn,
|
||||
backup_lsn: inmem.backup_lsn,
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
remote_consistent_lsn: inmem.remote_consistent_lsn,
|
||||
remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
|
||||
};
|
||||
json_response(StatusCode::OK, status)
|
||||
}
|
||||
@@ -175,6 +179,49 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Pull timeline from peer safekeeper instances.
|
||||
async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let data: pull_timeline::Request = json_request(&mut request).await?;
|
||||
|
||||
let resp = pull_timeline::handle_request(data)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
|
||||
/// Download a file from the timeline directory.
|
||||
// TODO: figure out a better way to copy files between safekeepers
|
||||
async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let filename: String = parse_request_param(&request, "filename")?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let filepath = tli.timeline_dir.join(filename);
|
||||
let mut file = File::open(&filepath)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let mut content = Vec::new();
|
||||
// TODO: don't store files in memory
|
||||
file.read_to_end(&mut content)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header("Content-Type", "application/octet-stream")
|
||||
.body(Body::from(content))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
}
|
||||
|
||||
/// Deactivates the timeline and removes its data directory.
|
||||
async fn timeline_delete_force_handler(
|
||||
mut request: Request<Body>,
|
||||
@@ -246,7 +293,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
};
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
tli.record_safekeeper_info(&proto_sk_info)
|
||||
tli.record_safekeeper_info(proto_sk_info)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -351,6 +398,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
timeline_delete_force_handler,
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
|
||||
.post("/v1/pull_timeline", timeline_pull_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
|
||||
timeline_files_handler,
|
||||
)
|
||||
// for tests
|
||||
.post(
|
||||
"/v1/record_safekeeper_info/:tenant_id/:timeline_id",
|
||||
|
||||
@@ -50,7 +50,7 @@ pub struct AppendLogicalMessage {
pub pg_version: u32,
}

#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize)]
struct AppendResult {
// safekeeper state after append
state: SafeKeeperState,
@@ -133,7 +133,7 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::R
Ok(())
}

#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize)]
pub struct InsertedWAL {
begin_lsn: Lsn,
pub end_lsn: Lsn,

@@ -15,6 +15,7 @@ pub mod handler;
pub mod http;
pub mod json_ctrl;
pub mod metrics;
pub mod pull_timeline;
pub mod receive_wal;
pub mod remove_wal;
pub mod safekeeper;

@@ -15,11 +15,11 @@ use metrics::{
use once_cell::sync::Lazy;

use postgres_ffi::XLogSegNo;
use utils::pageserver_feedback::PageserverFeedback;
use utils::{id::TenantTimelineId, lsn::Lsn};

use crate::{
safekeeper::{SafeKeeperState, SafekeeperMemState},
timeline::ReplicaState,
GlobalTimelines,
};

@@ -231,7 +231,7 @@ pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result<f64> {
/// Metrics for a single timeline.
pub struct FullTimelineInfo {
pub ttid: TenantTimelineId,
pub replicas: Vec<ReplicaState>,
pub ps_feedback: PageserverFeedback,
pub wal_backup_active: bool,
pub timeline_is_active: bool,
pub num_computes: u32,
@@ -242,6 +242,7 @@ pub struct FullTimelineInfo {
pub persisted_state: SafeKeeperState,

pub flush_lsn: Lsn,
pub remote_consistent_lsn: Lsn,

pub wal_storage: WalStorageMetrics,
}
@@ -514,19 +515,6 @@ impl Collector for TimelineCollector {
let timeline_id = tli.ttid.timeline_id.to_string();
let labels = &[tenant_id.as_str(), timeline_id.as_str()];

let mut most_advanced: Option<pq_proto::PageserverFeedback> = None;
for replica in tli.replicas.iter() {
if let Some(replica_feedback) = replica.pageserver_feedback {
if let Some(current) = most_advanced {
if current.last_received_lsn < replica_feedback.last_received_lsn {
most_advanced = Some(replica_feedback);
}
} else {
most_advanced = Some(replica_feedback);
}
}
}

self.commit_lsn
.with_label_values(labels)
.set(tli.mem_state.commit_lsn.into());
@@ -544,7 +532,7 @@ impl Collector for TimelineCollector {
.set(tli.mem_state.peer_horizon_lsn.into());
self.remote_consistent_lsn
.with_label_values(labels)
.set(tli.mem_state.remote_consistent_lsn.into());
.set(tli.remote_consistent_lsn.into());
self.timeline_active
.with_label_values(labels)
.set(tli.timeline_is_active as u64);
@@ -567,15 +555,17 @@ impl Collector for TimelineCollector {
.with_label_values(labels)
.set(tli.wal_storage.flush_wal_seconds);

if let Some(feedback) = most_advanced {
self.ps_last_received_lsn
self.ps_last_received_lsn
.with_label_values(labels)
.set(tli.ps_feedback.last_received_lsn.0);
if let Ok(unix_time) = tli
.ps_feedback
.replytime
.duration_since(SystemTime::UNIX_EPOCH)
{
self.feedback_last_time_seconds
.with_label_values(labels)
.set(feedback.last_received_lsn);
if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) {
self.feedback_last_time_seconds
.with_label_values(labels)
.set(unix_time.as_secs());
}
.set(unix_time.as_secs());
}

if tli.last_removed_segno != 0 {

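For reference, the collector above follows the usual labelled-gauge pattern; a standalone sketch against the plain `prometheus` crate (the repo goes through its own `metrics` wrapper) looks like this:

```rust
use prometheus::{IntGaugeVec, Opts};

// Minimal sketch of the .with_label_values(labels).set(value) pattern used by
// the timeline collector; names here are placeholders, not the repo's metrics.
fn example() -> prometheus::Result<()> {
    let gauge = IntGaugeVec::new(
        Opts::new("safekeeper_example_lsn", "example gauge"),
        &["tenant_id", "timeline_id"],
    )?;
    gauge
        .with_label_values(&["<tenant>", "<timeline>"])
        .set(42);
    Ok(())
}
```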
240
safekeeper/src/pull_timeline.rs
Normal file
@@ -0,0 +1,240 @@
use serde::{Deserialize, Serialize};

use anyhow::{bail, Context, Result};
use tokio::io::AsyncWriteExt;
use tracing::info;
use utils::id::{TenantId, TenantTimelineId, TimelineId};

use serde_with::{serde_as, DisplayFromStr};

use crate::{
control_file, debug_dump,
http::routes::TimelineStatus,
wal_storage::{self, Storage},
GlobalTimelines,
};

/// Info about timeline on safekeeper ready for reporting.
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct Request {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub http_hosts: Vec<String>,
}

#[derive(Debug, Serialize)]
pub struct Response {
// Donor safekeeper host
pub safekeeper_host: String,
// TODO: add more fields?
}

/// Find the most advanced safekeeper and pull timeline from it.
pub async fn handle_request(request: Request) -> Result<Response> {
let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
request.tenant_id,
request.timeline_id,
));
if existing_tli.is_ok() {
bail!("Timeline {} already exists", request.timeline_id);
}

let client = reqwest::Client::new();
let http_hosts = request.http_hosts.clone();

// Send request to /v1/tenant/:tenant_id/timeline/:timeline_id
let responses = futures::future::join_all(http_hosts.iter().map(|url| {
let url = format!(
"{}/v1/tenant/{}/timeline/{}",
url, request.tenant_id, request.timeline_id
);
client.get(url).send()
}))
.await;

let mut statuses = Vec::new();
for (i, response) in responses.into_iter().enumerate() {
let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?;
let status: crate::http::routes::TimelineStatus = response.json().await?;
statuses.push((status, i));
}

// Find the most advanced safekeeper
// TODO: current logic may be wrong, fix it later
let (status, i) = statuses
.into_iter()
.max_by_key(|(status, _)| {
(
status.acceptor_state.epoch,
status.flush_lsn,
status.commit_lsn,
)
})
.unwrap();
let safekeeper_host = http_hosts[i].clone();

assert!(status.tenant_id == request.tenant_id);
assert!(status.timeline_id == request.timeline_id);

pull_timeline(status, safekeeper_host).await
}

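The donor is chosen with `max_by_key` over the tuple `(epoch, flush_lsn, commit_lsn)`, i.e. lexicographic ordering: the highest epoch wins, ties are broken by flush LSN, then by commit LSN. A minimal standalone illustration of that ordering (not code from the repo):

```rust
// Pick the index of the "most advanced" candidate by (epoch, flush_lsn, commit_lsn).
fn pick_donor(candidates: &[(u64, u64, u64)]) -> Option<usize> {
    candidates
        .iter()
        .enumerate()
        .max_by_key(|(_, key)| **key)
        .map(|(i, _)| i)
}

fn main() {
    // Epoch 3 beats epoch 2; among equal epochs the higher flush_lsn wins.
    assert_eq!(pick_donor(&[(2, 10, 5), (3, 1, 1), (3, 4, 0)]), Some(2));
}
```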
async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response> {
|
||||
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
|
||||
info!(
|
||||
"Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
|
||||
ttid,
|
||||
host,
|
||||
status.commit_lsn,
|
||||
status.flush_lsn,
|
||||
status.acceptor_state.term,
|
||||
status.acceptor_state.epoch
|
||||
);
|
||||
|
||||
let conf = &GlobalTimelines::get_global_config();
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
// TODO: don't use debug dump, it should be used only in tests.
|
||||
// This is a proof of concept, we should figure out a way
|
||||
// to use scp without implementing it manually.
|
||||
|
||||
// Implementing our own scp over HTTP.
|
||||
// At first, we need to fetch list of files from safekeeper.
|
||||
let dump: debug_dump::Response = client
|
||||
.get(format!(
|
||||
"{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}",
|
||||
host, status.tenant_id, status.timeline_id
|
||||
))
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
|
||||
if dump.timelines.len() != 1 {
|
||||
bail!(
|
||||
"Expected to fetch single timeline, got {} timelines",
|
||||
dump.timelines.len()
|
||||
);
|
||||
}
|
||||
|
||||
let timeline = dump.timelines.into_iter().next().unwrap();
|
||||
let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!(
|
||||
"Timeline {} doesn't have disk content",
|
||||
ttid
|
||||
))?;
|
||||
|
||||
let mut filenames = disk_content
|
||||
.files
|
||||
.iter()
|
||||
.map(|file| file.name.clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Sort filenames to make sure we pull files in correct order
|
||||
// After sorting, we should have:
|
||||
// - 000000010000000000000001
|
||||
// - ...
|
||||
// - 000000010000000000000002.partial
|
||||
// - safekeeper.control
|
||||
filenames.sort();
|
||||
|
||||
// safekeeper.control should be the first file, so we need to move it to the beginning
|
||||
let control_file_index = filenames
|
||||
.iter()
|
||||
.position(|name| name == "safekeeper.control")
|
||||
.ok_or(anyhow::anyhow!("safekeeper.control not found"))?;
|
||||
filenames.remove(control_file_index);
|
||||
filenames.insert(0, "safekeeper.control".to_string());
|
||||
|
||||
info!(
|
||||
"Downloading {} files from safekeeper {}",
|
||||
filenames.len(),
|
||||
host
|
||||
);
|
||||
|
||||
// Creating temp directory for a new timeline. It needs to be
|
||||
// located on the same filesystem as the rest of the timelines.
|
||||
|
||||
// conf.workdir is usually /storage/safekeeper/data
|
||||
// will try to transform it into /storage/safekeeper/tmp
|
||||
let temp_base = conf
|
||||
.workdir
|
||||
.parent()
|
||||
.ok_or(anyhow::anyhow!("workdir has no parent"))?
|
||||
.join("tmp");
|
||||
|
||||
tokio::fs::create_dir_all(&temp_base).await?;
|
||||
|
||||
let tli_dir = tempfile::Builder::new()
|
||||
.suffix("_temptli")
|
||||
.prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
|
||||
.tempdir_in(temp_base)?;
|
||||
let tli_dir_path = tli_dir.path().to_owned();
|
||||
|
||||
// Note: some time happens between fetching list of files and fetching files themselves.
|
||||
// It's possible that some files will be removed from safekeeper and we will fail to fetch them.
|
||||
// This function will fail in this case, should be retried by the caller.
|
||||
for filename in filenames {
|
||||
let file_path = tli_dir_path.join(&filename);
|
||||
// /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename
|
||||
let http_url = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/file/{}",
|
||||
host, status.tenant_id, status.timeline_id, filename
|
||||
);
|
||||
|
||||
let mut file = tokio::fs::File::create(&file_path).await?;
|
||||
let mut response = client.get(&http_url).send().await?;
|
||||
while let Some(chunk) = response.chunk().await? {
|
||||
file.write_all(&chunk).await?;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: fsync?
|
||||
|
||||
// Let's create timeline from temp directory and verify that it's correct
|
||||
|
||||
let control_path = tli_dir_path.join("safekeeper.control");
|
||||
|
||||
let control_store = control_file::FileStorage::load_control_file(control_path)?;
|
||||
if control_store.server.wal_seg_size == 0 {
|
||||
bail!("wal_seg_size is not set");
|
||||
}
|
||||
|
||||
let wal_store =
|
||||
wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?;
|
||||
|
||||
let commit_lsn = status.commit_lsn;
|
||||
let flush_lsn = wal_store.flush_lsn();
|
||||
|
||||
info!(
|
||||
"Finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
|
||||
ttid, commit_lsn, flush_lsn
|
||||
);
|
||||
assert!(status.commit_lsn <= status.flush_lsn);
|
||||
|
||||
// Move timeline dir to the correct location
|
||||
let timeline_path = conf.timeline_dir(&ttid);
|
||||
|
||||
info!(
|
||||
"Moving timeline {} from {} to {}",
|
||||
ttid,
|
||||
tli_dir_path.display(),
|
||||
timeline_path.display()
|
||||
);
|
||||
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
|
||||
tokio::fs::rename(tli_dir_path, &timeline_path).await?;
|
||||
|
||||
let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?;
|
||||
|
||||
info!(
|
||||
"Loaded timeline {}, flush_lsn={}",
|
||||
ttid,
|
||||
tli.get_flush_lsn()
|
||||
);
|
||||
|
||||
Ok(Response {
|
||||
safekeeper_host: host,
|
||||
})
|
||||
}
|
||||
@@ -18,7 +18,8 @@ use crate::control_file;
|
||||
use crate::send_wal::HotStandbyFeedback;
|
||||
|
||||
use crate::wal_storage;
|
||||
use pq_proto::{PageserverFeedback, SystemId};
|
||||
use pq_proto::SystemId;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::{
|
||||
bin_ser::LeSer,
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -205,14 +206,13 @@ pub struct SafeKeeperState {
|
||||
pub peers: PersistedPeers,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
// In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values
|
||||
// are not flushed yet.
|
||||
pub struct SafekeeperMemState {
|
||||
pub commit_lsn: Lsn,
|
||||
pub backup_lsn: Lsn,
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
}
|
||||
@@ -347,7 +347,7 @@ pub struct AppendRequestHeader {
|
||||
}
|
||||
|
||||
/// Report safekeeper state to proposer
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct AppendResponse {
|
||||
// Current term of the safekeeper; if it is higher than proposer's, the
|
||||
// compute is out of date.
|
||||
@@ -540,7 +540,6 @@ where
|
||||
commit_lsn: state.commit_lsn,
|
||||
backup_lsn: state.backup_lsn,
|
||||
peer_horizon_lsn: state.peer_horizon_lsn,
|
||||
remote_consistent_lsn: state.remote_consistent_lsn,
|
||||
proposer_uuid: state.proposer_uuid,
|
||||
},
|
||||
state,
|
||||
@@ -781,10 +780,6 @@ where
|
||||
|
||||
// Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment.
|
||||
self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
|
||||
// Initializing remote_consistent_lsn sets that we have nothing to
|
||||
// stream to pageserver(s) immediately after creation.
|
||||
self.inmem.remote_consistent_lsn =
|
||||
max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn);
|
||||
|
||||
state.acceptor_state.term_history = msg.term_history.clone();
|
||||
self.persist_control_file(state)?;
|
||||
@@ -837,7 +832,6 @@ where
|
||||
state.commit_lsn = self.inmem.commit_lsn;
|
||||
state.backup_lsn = self.inmem.backup_lsn;
|
||||
state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
|
||||
state.remote_consistent_lsn = self.inmem.remote_consistent_lsn;
|
||||
state.proposer_uuid = self.inmem.proposer_uuid;
|
||||
self.state.persist(&state)
|
||||
}
|
||||
@@ -940,14 +934,12 @@ where
|
||||
self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn;
|
||||
self.inmem.backup_lsn = new_backup_lsn;
|
||||
|
||||
let new_remote_consistent_lsn = max(
|
||||
Lsn(sk_info.remote_consistent_lsn),
|
||||
self.inmem.remote_consistent_lsn,
|
||||
);
|
||||
// value in sk_info should be maximized over our local in memory value.
|
||||
let new_remote_consistent_lsn = Lsn(sk_info.remote_consistent_lsn);
|
||||
assert!(self.state.remote_consistent_lsn <= new_remote_consistent_lsn);
|
||||
sync_control_file |= self.state.remote_consistent_lsn
|
||||
+ (self.state.server.wal_seg_size as u64)
|
||||
< new_remote_consistent_lsn;
|
||||
self.inmem.remote_consistent_lsn = new_remote_consistent_lsn;
|
||||
|
||||
let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn);
|
||||
sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
|
||||
@@ -955,7 +947,12 @@ where
|
||||
self.inmem.peer_horizon_lsn = new_peer_horizon_lsn;
|
||||
|
||||
if sync_control_file {
|
||||
self.persist_control_file(self.state.clone())?;
|
||||
let mut state = self.state.clone();
|
||||
// Note: we do not persist remote_consistent_lsn in other paths of
|
||||
// persisting cf -- that is not much needed currently. We could do
|
||||
// that by storing Arc to walsenders in Safekeeper.
|
||||
state.remote_consistent_lsn = new_remote_consistent_lsn;
|
||||
self.persist_control_file(state)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,21 +1,28 @@
//! This module implements the streaming side of replication protocol, starting
//! with the "START_REPLICATION" message.
//! with the "START_REPLICATION" message, and registry of walsenders.

use crate::handler::SafekeeperPostgresHandler;
use crate::timeline::{ReplicaState, Timeline};
use crate::timeline::Timeline;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use anyhow::Context as AnyhowContext;
use bytes::Bytes;
use parking_lot::Mutex;
use postgres_backend::PostgresBackend;
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
use postgres_ffi::get_current_timestamp;
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody};
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use tokio::io::{AsyncRead, AsyncWrite};
use utils::id::TenantTimelineId;
use utils::lsn::AtomicLsn;
use utils::pageserver_feedback::PageserverFeedback;

use std::cmp::min;
use std::cmp::{max, min};
use std::net::SocketAddr;
use std::str;
use std::sync::Arc;
use std::time::Duration;
@@ -40,6 +47,8 @@ pub struct HotStandbyFeedback {
pub catalog_xmin: FullTransactionId,
}

const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0;

impl HotStandbyFeedback {
pub fn empty() -> HotStandbyFeedback {
HotStandbyFeedback {
@@ -51,24 +60,294 @@ impl HotStandbyFeedback {
}

/// Standby status update
#[derive(Debug, Clone, Deserialize)]
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct StandbyReply {
pub write_lsn: Lsn, // last lsn received by pageserver
pub flush_lsn: Lsn, // pageserver's disk consistent lSN
pub apply_lsn: Lsn, // pageserver's remote consistent lSN
pub reply_ts: TimestampTz,
pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby.
pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby.
pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby.
pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01.
pub reply_requested: bool,
}

/// Scope guard to unregister replication connection from timeline
|
||||
struct ReplicationConnGuard {
|
||||
replica: usize, // replica internal ID assigned by timeline
|
||||
timeline: Arc<Timeline>,
|
||||
impl StandbyReply {
|
||||
fn empty() -> Self {
|
||||
StandbyReply {
|
||||
write_lsn: Lsn::INVALID,
|
||||
flush_lsn: Lsn::INVALID,
|
||||
apply_lsn: Lsn::INVALID,
|
||||
reply_ts: 0,
|
||||
reply_requested: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ReplicationConnGuard {
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct StandbyFeedback {
|
||||
reply: StandbyReply,
|
||||
hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
|
||||
/// WalSenders registry. Timeline holds it (wrapped in Arc).
pub struct WalSenders {
/// Lsn maximized over all walsenders *and* peer data, so might be higher
/// than what we receive from replicas.
remote_consistent_lsn: AtomicLsn,
mutex: Mutex<WalSendersShared>,
}

impl WalSenders {
pub fn new(remote_consistent_lsn: Lsn) -> Arc<WalSenders> {
Arc::new(WalSenders {
remote_consistent_lsn: AtomicLsn::from(remote_consistent_lsn),
mutex: Mutex::new(WalSendersShared::new()),
})
}

/// Register new walsender. Returned guard provides access to the slot and
/// automatically deregisters in Drop.
fn register(
self: &Arc<WalSenders>,
ttid: TenantTimelineId,
addr: SocketAddr,
conn_id: ConnectionId,
appname: Option<String>,
) -> WalSenderGuard {
let slots = &mut self.mutex.lock().slots;
let walsender_state = WalSenderState {
ttid,
addr,
conn_id,
appname,
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
};
// find empty slot or create new one
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
slots[pos] = Some(walsender_state);
pos
} else {
let pos = slots.len();
slots.push(Some(walsender_state));
pos
};
WalSenderGuard {
id: pos,
walsenders: self.clone(),
}
}
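`WalSenders::register` hands back a `WalSenderGuard` whose `Drop` impl frees the slot, so a walsender can never leak its registry entry. A generic, self-contained sketch of the same slot-reuse-plus-guard pattern (simplified; the real registry lives behind an `Arc` and holds richer state):

```rust
use std::sync::Mutex;

struct Registry {
    slots: Mutex<Vec<Option<String>>>,
}

struct Guard<'a> {
    id: usize,
    registry: &'a Registry,
}

impl Registry {
    fn register(&self, name: String) -> Guard<'_> {
        let mut slots = self.slots.lock().unwrap();
        // Reuse the first free slot if any, otherwise grow the vector.
        let id = match slots.iter().position(|s| s.is_none()) {
            Some(pos) => {
                slots[pos] = Some(name);
                pos
            }
            None => {
                slots.push(Some(name));
                slots.len() - 1
            }
        };
        Guard { id, registry: self }
    }
}

impl Drop for Guard<'_> {
    fn drop(&mut self) {
        // Deregistration happens automatically when the guard goes out of scope.
        self.registry.slots.lock().unwrap()[self.id] = None;
    }
}
```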
|
||||
/// Get state of all walsenders.
|
||||
pub fn get_all(self: &Arc<WalSenders>) -> Vec<WalSenderState> {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
}
|
||||
|
||||
/// Get aggregated pageserver feedback.
|
||||
pub fn get_ps_feedback(self: &Arc<WalSenders>) -> PageserverFeedback {
|
||||
self.mutex.lock().agg_ps_feedback
|
||||
}
|
||||
|
||||
/// Get aggregated pageserver and hot standby feedback (we send them to compute).
|
||||
pub fn get_feedbacks(self: &Arc<WalSenders>) -> (PageserverFeedback, HotStandbyFeedback) {
|
||||
let shared = self.mutex.lock();
|
||||
(shared.agg_ps_feedback, shared.agg_hs_feedback)
|
||||
}
|
||||
|
||||
/// Record new pageserver feedback, update aggregated values.
|
||||
fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
|
||||
shared.update_ps_feedback();
|
||||
self.update_remote_consistent_lsn(shared.agg_ps_feedback.remote_consistent_lsn);
|
||||
}
|
||||
|
||||
/// Record standby reply.
|
||||
fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
|
||||
let mut shared = self.mutex.lock();
|
||||
let slot = shared.get_slot_mut(id);
|
||||
match &mut slot.feedback {
|
||||
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: *reply,
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Record hot standby feedback, update aggregated value.
|
||||
fn record_hs_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &HotStandbyFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
let slot = shared.get_slot_mut(id);
|
||||
match &mut slot.feedback {
|
||||
ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: StandbyReply::empty(),
|
||||
hs_feedback: *feedback,
|
||||
})
|
||||
}
|
||||
}
|
||||
shared.update_hs_feedback();
|
||||
}
|
||||
|
||||
/// Get remote_consistent_lsn reported by the pageserver. Returns None if
|
||||
/// client is not pageserver.
|
||||
fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
|
||||
let shared = self.mutex.lock();
|
||||
let slot = shared.get_slot(id);
|
||||
match slot.feedback {
|
||||
ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get remote_consistent_lsn maximized across all walsenders and peers.
|
||||
pub fn get_remote_consistent_lsn(self: &Arc<WalSenders>) -> Lsn {
|
||||
self.remote_consistent_lsn.load()
|
||||
}
|
||||
|
||||
/// Update maximized remote_consistent_lsn, return new (potentially) value.
|
||||
pub fn update_remote_consistent_lsn(self: &Arc<WalSenders>, candidate: Lsn) -> Lsn {
|
||||
self.remote_consistent_lsn
|
||||
.fetch_max(candidate)
|
||||
.max(candidate)
|
||||
}
|
||||
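`update_remote_consistent_lsn` is a lock-free monotonic update: `fetch_max` guarantees the stored value never moves backwards, and taking `max` of the previous value and the candidate yields the value now in effect. The same idea on a bare `AtomicU64`:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Monotonic LSN update: returns the value stored after the update.
fn advance(lsn: &AtomicU64, candidate: u64) -> u64 {
    lsn.fetch_max(candidate, Ordering::SeqCst).max(candidate)
}
```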
|
||||
/// Unregister walsender.
|
||||
fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.slots[id] = None;
|
||||
shared.update_hs_feedback();
|
||||
}
|
||||
}
|
||||
|
||||
struct WalSendersShared {
|
||||
// aggregated over all walsenders value
|
||||
agg_hs_feedback: HotStandbyFeedback,
|
||||
// aggregated over all walsenders value
|
||||
agg_ps_feedback: PageserverFeedback,
|
||||
slots: Vec<Option<WalSenderState>>,
|
||||
}
|
||||
|
||||
impl WalSendersShared {
|
||||
fn new() -> Self {
|
||||
WalSendersShared {
|
||||
agg_hs_feedback: HotStandbyFeedback::empty(),
|
||||
agg_ps_feedback: PageserverFeedback::empty(),
|
||||
slots: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get content of provided id slot, it must exist.
|
||||
fn get_slot(&self, id: WalSenderId) -> &WalSenderState {
|
||||
self.slots[id].as_ref().expect("walsender doesn't exist")
|
||||
}
|
||||
|
||||
/// Get mut content of provided id slot, it must exist.
|
||||
fn get_slot_mut(&mut self, id: WalSenderId) -> &mut WalSenderState {
|
||||
self.slots[id].as_mut().expect("walsender doesn't exist")
|
||||
}
|
||||
|
||||
/// Update aggregated hot standy feedback. We just take min of valid xmins
|
||||
/// and ts.
|
||||
fn update_hs_feedback(&mut self) {
|
||||
let mut agg = HotStandbyFeedback::empty();
|
||||
for ws_state in self.slots.iter().flatten() {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
|
||||
let hs_feedback = standby_feedback.hs_feedback;
|
||||
// doing Option math like op1.iter().chain(op2.iter()).min()
|
||||
// would be nicer, but we serialize/deserialize this struct
|
||||
// directly, so leave as is for now
|
||||
if hs_feedback.xmin != INVALID_FULL_TRANSACTION_ID {
|
||||
if agg.xmin != INVALID_FULL_TRANSACTION_ID {
|
||||
agg.xmin = min(agg.xmin, hs_feedback.xmin);
|
||||
} else {
|
||||
agg.xmin = hs_feedback.xmin;
|
||||
}
|
||||
agg.ts = min(agg.ts, hs_feedback.ts);
|
||||
}
|
||||
if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
|
||||
if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
|
||||
agg.catalog_xmin = min(agg.catalog_xmin, hs_feedback.catalog_xmin);
|
||||
} else {
|
||||
agg.catalog_xmin = hs_feedback.catalog_xmin;
|
||||
}
|
||||
agg.ts = min(agg.ts, hs_feedback.ts);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.agg_hs_feedback = agg;
|
||||
}
|
||||
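The aggregation above keeps the sentinel-based representation because the struct is serialized directly; the "Option math" alternative mentioned in the comment would look roughly like this (illustrative only):

```rust
// Minimum over two optional xmins without a sentinel value.
fn min_valid(a: Option<u64>, b: Option<u64>) -> Option<u64> {
    a.iter().chain(b.iter()).min().copied()
}
```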
|
||||
/// Update aggregated pageserver feedback. LSNs (last_received,
|
||||
/// disk_consistent, remote_consistent) and reply timestamp are just
|
||||
/// maximized; timeline_size if taken from feedback with highest
|
||||
/// last_received lsn. This is generally reasonable, but we might want to
|
||||
/// implement other policies once multiple pageservers start to be actively
|
||||
/// used.
|
||||
fn update_ps_feedback(&mut self) {
|
||||
let init = PageserverFeedback::empty();
|
||||
let acc =
|
||||
self.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.fold(init, |mut acc, ws_state| match ws_state.feedback {
|
||||
ReplicationFeedback::Pageserver(feedback) => {
|
||||
if feedback.last_received_lsn > acc.last_received_lsn {
|
||||
acc.current_timeline_size = feedback.current_timeline_size;
|
||||
}
|
||||
acc.last_received_lsn =
|
||||
max(feedback.last_received_lsn, acc.last_received_lsn);
|
||||
acc.disk_consistent_lsn =
|
||||
max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn);
|
||||
acc.remote_consistent_lsn =
|
||||
max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn);
|
||||
acc.replytime = max(feedback.replytime, acc.replytime);
|
||||
acc
|
||||
}
|
||||
ReplicationFeedback::Standby(_) => acc,
|
||||
});
|
||||
self.agg_ps_feedback = acc;
|
||||
}
|
||||
}
|
||||
|
||||
// Serialized is used only for pretty printing in json.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalSenderState {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
ttid: TenantTimelineId,
|
||||
addr: SocketAddr,
|
||||
conn_id: ConnectionId,
|
||||
// postgres application_name
|
||||
appname: Option<String>,
|
||||
feedback: ReplicationFeedback,
|
||||
}
|
||||
|
||||
// Receiver is either pageserver or regular standby, which have different
|
||||
// feedbacks.
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
enum ReplicationFeedback {
|
||||
Pageserver(PageserverFeedback),
|
||||
Standby(StandbyFeedback),
|
||||
}
|
||||
|
||||
// id of the occupied slot in WalSenders to access it (and save in the
|
||||
// WalSenderGuard). We could give Arc directly to the slot, but there is not
|
||||
// much sense in that as values aggregation which is performed on each feedback
|
||||
// receival iterates over all walsenders.
|
||||
pub type WalSenderId = usize;
|
||||
|
||||
/// Scope guard to access slot in WalSenders registry and unregister from it in
|
||||
/// Drop.
|
||||
pub struct WalSenderGuard {
|
||||
id: WalSenderId,
|
||||
walsenders: Arc<WalSenders>,
|
||||
}
|
||||
|
||||
impl Drop for WalSenderGuard {
|
||||
fn drop(&mut self) {
|
||||
self.timeline.remove_replica(self.replica);
|
||||
self.walsenders.unregister(self.id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,16 +376,13 @@ impl SafekeeperPostgresHandler {
|
||||
let tli =
|
||||
GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;
|
||||
|
||||
let state = ReplicaState::new();
|
||||
// This replica_id is used below to check if it's time to stop replication.
|
||||
let replica_id = tli.add_replica(state);
|
||||
|
||||
// Use a guard object to remove our entry from the timeline, when the background
|
||||
// thread and us have both finished using it.
|
||||
let _guard = Arc::new(ReplicationConnGuard {
|
||||
replica: replica_id,
|
||||
timeline: tli.clone(),
|
||||
});
|
||||
// Use a guard object to remove our entry from the timeline when we are done.
|
||||
let ws_guard = Arc::new(tli.get_walsenders().register(
|
||||
self.ttid,
|
||||
*pgb.get_peer_addr(),
|
||||
self.conn_id,
|
||||
self.appname.clone(),
|
||||
));
|
||||
|
||||
// Walproposer gets special handling: safekeeper must give proposer all
|
||||
// local WAL till the end, whether committed or not (walproposer will
|
||||
@@ -154,16 +430,11 @@ impl SafekeeperPostgresHandler {
|
||||
end_pos,
|
||||
stop_pos,
|
||||
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
|
||||
replica_id,
|
||||
ws_guard: ws_guard.clone(),
|
||||
wal_reader,
|
||||
send_buf: [0; MAX_SEND_SIZE],
|
||||
};
|
||||
let mut reply_reader = ReplyReader {
|
||||
reader,
|
||||
tli,
|
||||
replica_id,
|
||||
feedback: ReplicaState::new(),
|
||||
};
|
||||
let mut reply_reader = ReplyReader { reader, ws_guard };
|
||||
|
||||
let res = tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
@@ -190,7 +461,7 @@ struct WalSender<'a, IO> {
|
||||
// in recovery.
|
||||
stop_pos: Option<Lsn>,
|
||||
commit_lsn_watch_rx: Receiver<Lsn>,
|
||||
replica_id: usize,
|
||||
ws_guard: Arc<WalSenderGuard>,
|
||||
wal_reader: WalReader,
|
||||
// buffer for readling WAL into to send it
|
||||
send_buf: [u8; MAX_SEND_SIZE],
|
||||
@@ -264,14 +535,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
return Ok(());
|
||||
}
|
||||
// Timed out waiting for WAL, check for termination and send KA
|
||||
if self.tli.should_walsender_stop(self.replica_id) {
|
||||
// Terminate if there is nothing more to send.
|
||||
// TODO close the stream properly
|
||||
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
|
||||
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
|
||||
self.appname, self.start_pos,
|
||||
)));
|
||||
if let Some(remote_consistent_lsn) = self
|
||||
.ws_guard
|
||||
.walsenders
|
||||
.get_ws_remote_consistent_lsn(self.ws_guard.id)
|
||||
{
|
||||
if self.tli.should_walsender_stop(remote_consistent_lsn) {
|
||||
// Terminate if there is nothing more to send.
|
||||
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
|
||||
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
|
||||
self.appname, self.start_pos,
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
self.pgb
|
||||
.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
|
||||
sent_ptr: self.end_pos.0,
|
||||
@@ -286,9 +563,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
/// A half driving receiving replies.
|
||||
struct ReplyReader<IO> {
|
||||
reader: PostgresBackendReader<IO>,
|
||||
tli: Arc<Timeline>,
|
||||
replica_id: usize,
|
||||
feedback: ReplicaState,
|
||||
ws_guard: Arc<WalSenderGuard>,
|
||||
}
|
||||
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
@@ -303,29 +578,32 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
match msg.first().cloned() {
|
||||
Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
|
||||
// Note: deserializing is on m[1..] because we skip the tag byte.
|
||||
self.feedback.hs_feedback = HotStandbyFeedback::des(&msg[1..])
|
||||
let hs_feedback = HotStandbyFeedback::des(&msg[1..])
|
||||
.context("failed to deserialize HotStandbyFeedback")?;
|
||||
self.tli
|
||||
.update_replica_state(self.replica_id, self.feedback);
|
||||
self.ws_guard
|
||||
.walsenders
|
||||
.record_hs_feedback(self.ws_guard.id, &hs_feedback);
|
||||
}
|
||||
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
|
||||
let _reply =
|
||||
let reply =
|
||||
StandbyReply::des(&msg[1..]).context("failed to deserialize StandbyReply")?;
|
||||
// This must be a regular postgres replica,
|
||||
// because pageserver doesn't send this type of messages to safekeeper.
|
||||
// Currently we just ignore this, tracking progress for them is not supported.
|
||||
self.ws_guard
|
||||
.walsenders
|
||||
.record_standby_reply(self.ws_guard.id, &reply);
|
||||
}
|
||||
Some(NEON_STATUS_UPDATE_TAG_BYTE) => {
|
||||
// pageserver sends this.
|
||||
// Note: deserializing is on m[9..] because we skip the tag byte and len bytes.
|
||||
let buf = Bytes::copy_from_slice(&msg[9..]);
|
||||
let reply = PageserverFeedback::parse(buf);
|
||||
let ps_feedback = PageserverFeedback::parse(buf);
|
||||
|
||||
trace!("PageserverFeedback is {:?}", reply);
|
||||
self.feedback.pageserver_feedback = Some(reply);
|
||||
|
||||
self.tli
|
||||
.update_replica_state(self.replica_id, self.feedback);
|
||||
trace!("PageserverFeedback is {:?}", ps_feedback);
|
||||
self.ws_guard
|
||||
.walsenders
|
||||
.record_ps_feedback(self.ws_guard.id, &ps_feedback);
|
||||
// in principle new remote_consistent_lsn could allow to
|
||||
// deactivate the timeline, but we check that regularly through
|
||||
// broker updated, not need to do it here
|
||||
}
|
||||
_ => warn!("unexpected message {:?}", msg),
|
||||
}
|
||||
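Each reply arrives as a CopyData message whose first byte selects the decoder: `h` for hot standby feedback and `r` for a standby status update are the standard Postgres replication tags, while the Neon-specific pageserver feedback uses its own tag byte. A schematic dispatcher (the exact Neon tag value is not assumed here):

```rust
// Schematic only: route a reply by its tag byte, as the match above does.
fn describe_reply(msg: &[u8]) -> &'static str {
    match msg.first() {
        Some(b'h') => "hot standby feedback",
        Some(b'r') => "standby status update",
        Some(_) => "possibly a pageserver (NEON) status update",
        None => "empty message",
    }
}
```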
@@ -368,3 +646,89 @@ async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option
|
||||
Err(_) => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn mock_ttid() -> TenantTimelineId {
|
||||
TenantTimelineId {
|
||||
tenant_id: TenantId::from_slice(&[0x00; 16]).unwrap(),
|
||||
timeline_id: TimelineId::from_slice(&[0x00; 16]).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn mock_addr() -> SocketAddr {
|
||||
"127.0.0.1:8080".parse().unwrap()
|
||||
}
|
||||
|
||||
// add to wss specified feedback setting other fields to dummy values
|
||||
fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) {
|
||||
let walsender_state = WalSenderState {
|
||||
ttid: mock_ttid(),
|
||||
addr: mock_addr(),
|
||||
conn_id: 1,
|
||||
appname: None,
|
||||
feedback,
|
||||
};
|
||||
wss.slots.push(Some(walsender_state))
|
||||
}
|
||||
|
||||
// form standby feedback with given hot standby feedback ts/xmin and the
|
||||
// rest set to dummy values.
|
||||
fn hs_feedback(ts: TimestampTz, xmin: FullTransactionId) -> ReplicationFeedback {
|
||||
ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: StandbyReply::empty(),
|
||||
hs_feedback: HotStandbyFeedback {
|
||||
ts,
|
||||
xmin,
|
||||
catalog_xmin: 0,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// test that hs aggregation works as expected
|
||||
#[test]
|
||||
fn test_hs_feedback_no_valid() {
|
||||
let mut wss = WalSendersShared::new();
|
||||
push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
|
||||
wss.update_hs_feedback();
|
||||
assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hs_feedback() {
|
||||
let mut wss = WalSendersShared::new();
|
||||
push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
|
||||
push_feedback(&mut wss, hs_feedback(1, 42));
|
||||
push_feedback(&mut wss, hs_feedback(1, 64));
|
||||
wss.update_hs_feedback();
|
||||
assert_eq!(wss.agg_hs_feedback.xmin, 42);
|
||||
}
|
||||
|
||||
// form pageserver feedback with given last_record_lsn / tli size and the
|
||||
// rest set to dummy values.
|
||||
fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback {
|
||||
ReplicationFeedback::Pageserver(PageserverFeedback {
|
||||
current_timeline_size,
|
||||
last_received_lsn,
|
||||
disk_consistent_lsn: Lsn::INVALID,
|
||||
remote_consistent_lsn: Lsn::INVALID,
|
||||
replytime: *PG_EPOCH,
|
||||
})
|
||||
}
|
||||
|
||||
// test that ps aggregation works as expected
|
||||
#[test]
|
||||
fn test_ps_feedback() {
|
||||
let mut wss = WalSendersShared::new();
|
||||
push_feedback(&mut wss, ps_feedback(8, Lsn(42)));
|
||||
push_feedback(&mut wss, ps_feedback(4, Lsn(84)));
|
||||
wss.update_ps_feedback();
|
||||
assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4);
|
||||
assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use parking_lot::{Mutex, MutexGuard};
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use pq_proto::PageserverFeedback;
|
||||
use serde::Serialize;
|
||||
use std::cmp::{max, min};
|
||||
|
||||
use std::cmp::max;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use tokio::{
|
||||
sync::{mpsc::Sender, watch},
|
||||
time::Instant,
|
||||
@@ -26,7 +26,7 @@ use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
||||
SafekeeperMemState, ServerInfo, Term,
|
||||
};
|
||||
use crate::send_wal::HotStandbyFeedback;
|
||||
use crate::send_wal::WalSenders;
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
use crate::metrics::FullTimelineInfo;
|
||||
@@ -81,48 +81,12 @@ impl PeersInfo {
|
||||
}
|
||||
}
|
||||
|
||||
/// Replica status update + hot standby feedback
|
||||
#[derive(Debug, Clone, Copy, Serialize)]
|
||||
pub struct ReplicaState {
|
||||
/// last known lsn received by replica
|
||||
pub last_received_lsn: Lsn, // None means we don't know
/// combined remote consistent lsn of pageservers
pub remote_consistent_lsn: Lsn,
/// combined hot standby feedback from all replicas
pub hs_feedback: HotStandbyFeedback,
/// Replication specific feedback received from pageserver, if any
pub pageserver_feedback: Option<PageserverFeedback>,
}

impl Default for ReplicaState {
fn default() -> Self {
Self::new()
}
}

impl ReplicaState {
pub fn new() -> ReplicaState {
ReplicaState {
last_received_lsn: Lsn::MAX,
remote_consistent_lsn: Lsn(0),
hs_feedback: HotStandbyFeedback {
ts: 0,
xmin: u64::MAX,
catalog_xmin: u64::MAX,
},
pageserver_feedback: None,
}
}
}

/// Shared state associated with database instance
pub struct SharedState {
/// Safekeeper object
sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
/// In memory list containing state of peers sent in latest messages from them.
peers_info: PeersInfo,
/// State of replicas
replicas: Vec<Option<ReplicaState>>,
/// True when WAL backup launcher oversees the timeline, making sure WAL is
/// offloaded, allows to bother launcher less.
wal_backup_active: bool,
@@ -165,13 +129,13 @@ impl SharedState {
// We don't want to write anything to disk, because we may have existing timeline there.
// These functions should not change anything on disk.
let control_store = control_file::FileStorage::create_new(ttid, conf, state)?;
let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;

Ok(Self {
sk,
peers_info: PeersInfo(vec![]),
replicas: vec![],
wal_backup_active: false,
active: false,
num_computes: 0,
@@ -186,12 +150,12 @@ impl SharedState {
bail!(TimelineError::UninitializedWalSegSize(*ttid));
}

let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;

Ok(Self {
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
peers_info: PeersInfo(vec![]),
replicas: Vec::new(),
wal_backup_active: false,
active: false,
num_computes: 0,
@@ -199,17 +163,17 @@ impl SharedState {
})
}

fn is_active(&self) -> bool {
fn is_active(&self, remote_consistent_lsn: Lsn) -> bool {
self.is_wal_backup_required()
// FIXME: add tracking of relevant pageservers and check them here individually,
// otherwise migration won't work (we suspend too early).
|| self.sk.inmem.remote_consistent_lsn < self.sk.inmem.commit_lsn
|| remote_consistent_lsn < self.sk.inmem.commit_lsn
}

/// Mark timeline active/inactive and return whether s3 offloading requires
/// start/stop action.
fn update_status(&mut self, ttid: TenantTimelineId) -> bool {
let is_active = self.is_active();
fn update_status(&mut self, remote_consistent_lsn: Lsn, ttid: TenantTimelineId) -> bool {
let is_active = self.is_active(remote_consistent_lsn);
if self.active != is_active {
info!("timeline {} active={} now", ttid, is_active);
}
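The hunk above changes is_active() to take the remote_consistent_lsn tracked by the walsenders instead of the copy held in the SafeKeeper's in-memory state. As a rough illustration (not part of the diff), the activity rule reduces to a predicate like the following sketch, which uses plain u64 values in place of the crate's Lsn type:

// Illustrative only: the timeline stays active while backup work remains or the
// pageserver still lags behind the safekeeper's commit position.
fn timeline_is_active(
    wal_backup_required: bool,
    remote_consistent_lsn: u64, // highest LSN the pageservers have persisted
    commit_lsn: u64,            // highest LSN committed on this safekeeper
) -> bool {
    wal_backup_required || remote_consistent_lsn < commit_lsn
}

fn main() {
    assert!(timeline_is_active(false, 100, 200)); // pageserver behind -> active
    assert!(!timeline_is_active(false, 200, 200)); // caught up, no backup work -> idle
}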
@@ -254,68 +218,11 @@ impl SharedState {
self.sk.state.server.wal_seg_size as usize
}

/// Get combined state of all alive replicas
pub fn get_replicas_state(&self) -> ReplicaState {
let mut acc = ReplicaState::new();
for state in self.replicas.iter().flatten() {
acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts);
acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin);
acc.hs_feedback.catalog_xmin =
min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin);

// FIXME
// If multiple pageservers are streaming WAL and send feedback for the same timeline simultaneously,
// this code is not correct.
// Now the most advanced feedback is used.
// If one pageserver lags when another doesn't, the backpressure won't be activated on compute and lagging
// pageserver is prone to timeout errors.
//
// To choose what feedback to use and resend to compute node,
// we need to know which pageserver compute node considers to be main.
// See https://github.com/neondatabase/neon/issues/1171
//
if let Some(pageserver_feedback) = state.pageserver_feedback {
if let Some(acc_feedback) = acc.pageserver_feedback {
if acc_feedback.last_received_lsn < pageserver_feedback.last_received_lsn {
warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet.");
acc.pageserver_feedback = Some(pageserver_feedback);
}
} else {
acc.pageserver_feedback = Some(pageserver_feedback);
}

// last lsn received by pageserver
// FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver.
// See https://github.com/neondatabase/neon/issues/1171
acc.last_received_lsn = Lsn::from(pageserver_feedback.last_received_lsn);

// When at least one pageserver has preserved data up to remote_consistent_lsn,
// safekeeper is free to delete it, so choose max of all pageservers.
acc.remote_consistent_lsn = max(
Lsn::from(pageserver_feedback.remote_consistent_lsn),
acc.remote_consistent_lsn,
);
}
}
acc
}
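The removed get_replicas_state() above folded per-replica feedback into a single value: the most conservative hot-standby horizon and the most advanced remote_consistent_lsn. A minimal standalone sketch of that folding rule follows (illustrative only; the types are simplified stand-ins, not the crate's own definitions):

#[derive(Clone, Copy)]
struct Feedback {
    xmin: u64,
    remote_consistent_lsn: u64,
}

fn combine(feedbacks: &[Feedback]) -> Feedback {
    feedbacks.iter().fold(
        Feedback { xmin: u64::MAX, remote_consistent_lsn: 0 },
        |acc, f| Feedback {
            // the oldest xmin wins, so no replica's snapshot is invalidated
            xmin: acc.xmin.min(f.xmin),
            // the newest persisted LSN wins, so WAL can be trimmed eagerly
            remote_consistent_lsn: acc.remote_consistent_lsn.max(f.remote_consistent_lsn),
        },
    )
}

fn main() {
    let combined = combine(&[
        Feedback { xmin: 120, remote_consistent_lsn: 900 },
        Feedback { xmin: 100, remote_consistent_lsn: 1000 },
    ]);
    assert_eq!(combined.xmin, 100);
    assert_eq!(combined.remote_consistent_lsn, 1000);
}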

/// Assign new replica ID. We choose first empty cell in the replicas vector
/// or extend the vector if there are no free slots.
pub fn add_replica(&mut self, state: ReplicaState) -> usize {
if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) {
self.replicas[pos] = Some(state);
return pos;
}
let pos = self.replicas.len();
self.replicas.push(Some(state));
pos
}

fn get_safekeeper_info(
&self,
ttid: &TenantTimelineId,
conf: &SafeKeeperConf,
remote_consistent_lsn: Lsn,
) -> SafekeeperTimelineInfo {
SafekeeperTimelineInfo {
safekeeper_id: conf.my_id.0,
@@ -328,11 +235,7 @@ impl SharedState {
// note: this value is not flushed to control file yet and can be lost
commit_lsn: self.sk.inmem.commit_lsn.0,
// TODO: rework feedbacks to avoid max here
remote_consistent_lsn: max(
self.get_replicas_state().remote_consistent_lsn,
self.sk.inmem.remote_consistent_lsn,
)
.0,
remote_consistent_lsn: remote_consistent_lsn.0,
peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0,
safekeeper_connstr: conf.listen_pg_addr.clone(),
backup_lsn: self.sk.inmem.backup_lsn.0,
@@ -387,6 +290,7 @@ pub struct Timeline {
/// Safekeeper and other state, that should remain consistent and synchronized
/// with the disk.
mutex: Mutex<SharedState>,
walsenders: Arc<WalSenders>,

/// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
cancellation_tx: watch::Sender<bool>,
@@ -409,6 +313,7 @@ impl Timeline {
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();

let shared_state = SharedState::restore(&conf, &ttid)?;
let rcl = shared_state.sk.state.remote_consistent_lsn;
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
watch::channel(shared_state.sk.state.commit_lsn);
let (cancellation_tx, cancellation_rx) = watch::channel(false);
@@ -419,6 +324,7 @@ impl Timeline {
commit_lsn_watch_tx,
commit_lsn_watch_rx,
mutex: Mutex::new(shared_state),
walsenders: WalSenders::new(rcl),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&ttid),
@@ -444,6 +350,7 @@ impl Timeline {
commit_lsn_watch_tx,
commit_lsn_watch_rx,
mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
walsenders: WalSenders::new(Lsn(0)),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&ttid),
@@ -475,7 +382,7 @@ impl Timeline {
match || -> Result<()> {
shared_state.sk.persist()?;
// TODO: add more initialization steps here
shared_state.update_status(self.ttid);
self.update_status(shared_state);
Ok(())
}() {
Ok(_) => Ok(()),
@@ -531,6 +438,10 @@ impl Timeline {
self.mutex.lock()
}

fn update_status(&self, shared_state: &mut SharedState) -> bool {
shared_state.update_status(self.get_walsenders().get_remote_consistent_lsn(), self.ttid)
}

/// Register compute connection, starting timeline-related activity if it is
/// not running yet.
pub async fn on_compute_connect(&self) -> Result<()> {
@@ -542,7 +453,7 @@ impl Timeline {
{
let mut shared_state = self.write_shared_state();
shared_state.num_computes += 1;
is_wal_backup_action_pending = shared_state.update_status(self.ttid);
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
// Wake up wal backup launcher, if offloading not started yet.
if is_wal_backup_action_pending {
@@ -559,7 +470,7 @@ impl Timeline {
{
let mut shared_state = self.write_shared_state();
shared_state.num_computes -= 1;
is_wal_backup_action_pending = shared_state.update_status(self.ttid);
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
// Wake up wal backup launcher, if it is time to stop the offloading.
if is_wal_backup_action_pending {
@@ -574,26 +485,19 @@ impl Timeline {
Ok(())
}

/// Returns true if walsender should stop sending WAL to pageserver.
/// TODO: check this pageserver is actually interested in this timeline.
pub fn should_walsender_stop(&self, replica_id: usize) -> bool {
/// Returns true if walsender should stop sending WAL to pageserver. We
/// terminate it if remote_consistent_lsn reached commit_lsn and there are no
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if pageserver is fully caught up.
pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let mut shared_state = self.write_shared_state();
let shared_state = self.write_shared_state();
if shared_state.num_computes == 0 {
let replica_state = shared_state.replicas[replica_id].unwrap();
let reported_remote_consistent_lsn = replica_state
.pageserver_feedback
.map(|f| Lsn(f.remote_consistent_lsn))
.unwrap_or(Lsn::INVALID);
let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
(reported_remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
if stop {
shared_state.update_status(self.ttid);
return true;
}
return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
}
false
}
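For reference, the stop condition described in the rewritten should_walsender_stop() above can be sketched as a pure function (illustrative only; plain u64s stand in for Lsn, and u64::MAX models the "position not reported yet" case that the older variant guarded against):

fn should_walsender_stop(
    num_computes: usize,
    commit_lsn: u64,
    reported_remote_consistent_lsn: u64,
) -> bool {
    if num_computes > 0 {
        return false; // a compute is attached, keep streaming
    }
    // Nothing was ever committed, or the pageserver has persisted everything we have.
    commit_lsn == 0
        || (reported_remote_consistent_lsn != u64::MAX
            && reported_remote_consistent_lsn >= commit_lsn)
}

fn main() {
    assert!(!should_walsender_stop(1, 500, 500)); // compute attached: keep streaming
    assert!(should_walsender_stop(0, 500, 500)); // pageserver caught up: stop
    assert!(!should_walsender_stop(0, 500, u64::MAX)); // position unknown: keep going
}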
@@ -628,13 +532,12 @@ impl Timeline {
let mut shared_state = self.write_shared_state();
rmsg = shared_state.sk.process_msg(msg)?;

// if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
// if this is AppendResponse, fill in proper pageserver and hot
// standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
let state = shared_state.get_replicas_state();
resp.hs_feedback = state.hs_feedback;
if let Some(pageserver_feedback) = state.pageserver_feedback {
resp.pageserver_feedback = pageserver_feedback;
}
let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks();
resp.hs_feedback = hs_feedback;
resp.pageserver_feedback = ps_feedback;
}

commit_lsn = shared_state.sk.inmem.commit_lsn;
@@ -684,19 +587,29 @@ impl Timeline {
/// Get safekeeper info for broadcasting to broker and other peers.
pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
let shared_state = self.write_shared_state();
shared_state.get_safekeeper_info(&self.ttid, conf)
shared_state.get_safekeeper_info(
&self.ttid,
conf,
self.walsenders.get_remote_consistent_lsn(),
)
}

/// Update timeline state with peer safekeeper data.
pub async fn record_safekeeper_info(&self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
pub async fn record_safekeeper_info(&self, mut sk_info: SafekeeperTimelineInfo) -> Result<()> {
// Update local remote_consistent_lsn in memory (in .walsenders) and in
// sk_info to pass it down to control file.
sk_info.remote_consistent_lsn = self
.walsenders
.update_remote_consistent_lsn(Lsn(sk_info.remote_consistent_lsn))
.0;
let is_wal_backup_action_pending: bool;
let commit_lsn: Lsn;
{
let mut shared_state = self.write_shared_state();
shared_state.sk.record_safekeeper_info(sk_info)?;
let peer_info = PeerInfo::from_sk_info(sk_info, Instant::now());
shared_state.sk.record_safekeeper_info(&sk_info)?;
let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
shared_state.peers_info.upsert(&peer_info);
is_wal_backup_action_pending = shared_state.update_status(self.ttid);
is_wal_backup_action_pending = self.update_status(&mut shared_state);
commit_lsn = shared_state.sk.inmem.commit_lsn;
}
self.commit_lsn_watch_tx.send(commit_lsn)?;
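The new record_safekeeper_info() above pushes the incoming remote_consistent_lsn through the walsenders so the shared value only ever moves forward. A minimal sketch of such a monotonic update is shown below (an assumption about the intent; this is not the actual WalSenders implementation):

use std::sync::atomic::{AtomicU64, Ordering};

struct SharedRemoteConsistentLsn(AtomicU64);

impl SharedRemoteConsistentLsn {
    fn update(&self, candidate: u64) -> u64 {
        // fetch_max returns the previous value; the new maximum is whichever is larger.
        self.0.fetch_max(candidate, Ordering::SeqCst).max(candidate)
    }
}

fn main() {
    let lsn = SharedRemoteConsistentLsn(AtomicU64::new(100));
    assert_eq!(lsn.update(90), 100); // stale feedback never moves the value backwards
    assert_eq!(lsn.update(250), 250); // newer feedback advances it
}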
@@ -723,22 +636,8 @@ impl Timeline {
.collect()
}

/// Add send_wal replica to the in-memory vector of replicas.
pub fn add_replica(&self, state: ReplicaState) -> usize {
self.write_shared_state().add_replica(state)
}

/// Update replication replica state.
pub fn update_replica_state(&self, id: usize, state: ReplicaState) {
let mut shared_state = self.write_shared_state();
shared_state.replicas[id] = Some(state);
}

/// Remove send_wal replica from the in-memory vector of replicas.
pub fn remove_replica(&self, id: usize) {
let mut shared_state = self.write_shared_state();
assert!(shared_state.replicas[id].is_some());
shared_state.replicas[id] = None;
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
&self.walsenders
}

/// Returns flush_lsn.
@@ -781,16 +680,12 @@ impl Timeline {
return None;
}

let ps_feedback = self.walsenders.get_ps_feedback();
let state = self.write_shared_state();
if state.active {
Some(FullTimelineInfo {
ttid: self.ttid,
replicas: state
.replicas
.iter()
.filter_map(|r| r.as_ref())
.copied()
.collect(),
ps_feedback,
wal_backup_active: state.wal_backup_active,
timeline_is_active: state.active,
num_computes: state.num_computes,
@@ -799,6 +694,7 @@ impl Timeline {
mem_state: state.sk.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
remote_consistent_lsn: self.get_walsenders().get_remote_consistent_lsn(),
wal_storage: state.sk.wal_store.get_metrics(),
})
} else {
@@ -816,7 +712,7 @@ impl Timeline {
debug_dump::Memory {
is_cancelled: self.is_cancelled(),
peers_info_len: state.peers_info.0.len(),
replicas: state.replicas.clone(),
walsenders: self.walsenders.get_all(),
wal_backup_active: state.wal_backup_active,
active: state.active,
num_computes: state.num_computes,

@@ -159,6 +159,26 @@ impl GlobalTimelines {
Ok(())
}

/// Load timeline from disk to the memory.
pub fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();

match Timeline::load_timeline(conf, ttid, wal_backup_launcher_tx) {
Ok(timeline) => {
let tli = Arc::new(timeline);
// TODO: prevent concurrent timeline creation/loading
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());
Ok(tli)
}
// If we can't load a timeline, it's bad. Caller will figure it out.
Err(e) => bail!("failed to load timeline {}, reason: {:?}", ttid, e),
}
}

/// Get the number of timelines in the map.
pub fn timelines_count() -> usize {
TIMELINES_STATE.lock().unwrap().timelines.len()

@@ -18,6 +18,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF
use postgres_ffi::{XLogSegNo, PG_TLI};
use std::cmp::{max, min};

use bytes::Bytes;
use std::fs::{self, remove_file, File, OpenOptions};
use std::io::Write;
use std::path::{Path, PathBuf};
@@ -36,6 +37,7 @@ use postgres_ffi::XLOG_BLCKSZ;

use postgres_ffi::waldecoder::WalStreamDecoder;

use pq_proto::SystemId;
use tokio::io::{AsyncReadExt, AsyncSeekExt};

pub trait Storage {
@@ -110,10 +112,10 @@ impl PhysicalStorage {
/// the disk. Otherwise, all LSNs are set to zero.
pub fn new(
ttid: &TenantTimelineId,
timeline_dir: PathBuf,
conf: &SafeKeeperConf,
state: &SafeKeeperState,
) -> Result<PhysicalStorage> {
let timeline_dir = conf.timeline_dir(ttid);
let wal_seg_size = state.server.wal_seg_size as usize;

// Find out where stored WAL ends, starting at commit_lsn which is a
@@ -478,6 +480,13 @@ pub struct WalReader {

// We don't have WAL locally if LSN is less than local_start_lsn
local_start_lsn: Lsn,
// We will respond with zero-ed bytes before this Lsn as long as
// pos is in the same segment as timeline_start_lsn.
timeline_start_lsn: Lsn,
// integer version number of PostgreSQL, e.g. 14; 15; 16
pg_version: u32,
system_id: SystemId,
timeline_start_segment: Option<Bytes>,
}

impl WalReader {
@@ -488,19 +497,27 @@ impl WalReader {
start_pos: Lsn,
enable_remote_read: bool,
) -> Result<Self> {
if start_pos < state.timeline_start_lsn {
if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
bail!("state uninitialized, no data to read");
}

// TODO: Upgrade to bail!() once we know this couldn't possibly happen
if state.timeline_start_lsn == Lsn(0) {
warn!("timeline_start_lsn uninitialized before initializing wal reader");
}

if start_pos
< state
.timeline_start_lsn
.segment_lsn(state.server.wal_seg_size as usize)
{
bail!(
"Requested streaming from {}, which is before the start of the timeline {}",
"Requested streaming from {}, which is before the start of the timeline {}, and also doesn't start at the first segment of that timeline",
start_pos,
state.timeline_start_lsn
);
}

// TODO: add state.timeline_start_lsn == Lsn(0) check
if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
bail!("state uninitialized, no data to read");
}

Ok(Self {
workdir,
timeline_dir,
@@ -509,10 +526,65 @@ impl WalReader {
wal_segment: None,
enable_remote_read,
local_start_lsn: state.local_start_lsn,
timeline_start_lsn: state.timeline_start_lsn,
pg_version: state.server.pg_version / 10000,
system_id: state.server.system_id,
timeline_start_segment: None,
})
}

pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
// If this timeline is new, we may not have a full segment yet, so
// we pad the first bytes of the timeline's first WAL segment with 0s
if self.pos < self.timeline_start_lsn {
debug_assert_eq!(
self.pos.segment_number(self.wal_seg_size),
self.timeline_start_lsn.segment_number(self.wal_seg_size)
);

// All bytes after timeline_start_lsn are in WAL, but those before
// are not, so we manually construct an empty segment for the bytes
// not available in this timeline.
if self.timeline_start_segment.is_none() {
let it = postgres_ffi::generate_wal_segment(
self.timeline_start_lsn.segment_number(self.wal_seg_size),
self.system_id,
self.pg_version,
self.timeline_start_lsn,
)?;
self.timeline_start_segment = Some(it);
}

assert!(self.timeline_start_segment.is_some());
let segment = self.timeline_start_segment.take().unwrap();

let seg_bytes = &segment[..];

// How much of the current segment have we already consumed?
let pos_seg_offset = self.pos.segment_offset(self.wal_seg_size);

// How many bytes may we consume in total?
let tl_start_seg_offset = self.timeline_start_lsn.segment_offset(self.wal_seg_size);

debug_assert!(seg_bytes.len() > pos_seg_offset);
debug_assert!(seg_bytes.len() > tl_start_seg_offset);

// Copy as many bytes as possible into the buffer
let len = (tl_start_seg_offset - pos_seg_offset).min(buf.len());
buf[0..len].copy_from_slice(&seg_bytes[pos_seg_offset..pos_seg_offset + len]);

self.pos += len as u64;

// If we're done with the segment, we can release its memory.
// However, if we're not yet done, store it so that we don't have to
// construct the segment the next time this function is called.
if self.pos < self.timeline_start_lsn {
self.timeline_start_segment = Some(segment);
}

return Ok(len);
}
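The early-return branch above serves zero-padded bytes from a synthesized first segment until pos reaches timeline_start_lsn. The length calculation can be sketched in isolation as follows (illustrative only; segment offsets are plain integers here rather than Lsn-derived values):

// Given a read position inside the first segment but before timeline_start_lsn,
// how many zero-padded bytes can a single read request return?
fn padded_read_len(
    pos_seg_offset: usize,   // how far into the segment we already are
    start_seg_offset: usize, // offset of timeline_start_lsn within the segment
    buf_len: usize,          // caller's buffer size
) -> usize {
    // Only the gap up to timeline_start_lsn comes from the synthetic segment;
    // everything after it is served from real WAL on a later call.
    (start_seg_offset - pos_seg_offset).min(buf_len)
}

fn main() {
    // 1 MiB gap, 8 KiB buffer: the read is capped by the buffer.
    assert_eq!(padded_read_len(0, 1 << 20, 8192), 8192);
    // 100-byte gap, large buffer: the read is capped by the gap.
    assert_eq!(padded_read_len(0, 100, 8192), 100);
}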

let mut wal_segment = match self.wal_segment.take() {
Some(reader) => reader,
None => self.open_segment().await?,

@@ -42,12 +42,16 @@ def main(args: argparse.Namespace):
res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
res = defaultdict(lambda: defaultdict(dict))

logging.info("connecting to the database...")
with psycopg2.connect(connstr, connect_timeout=10) as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
logging.info("fetching flaky tests...")
cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
rows = cur.fetchall()
try:
logging.info("connecting to the database...")
with psycopg2.connect(connstr, connect_timeout=30) as conn:
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
logging.info("fetching flaky tests...")
cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
rows = cur.fetchall()
except psycopg2.OperationalError as exc:
logging.error("cannot fetch flaky tests from the DB due to an error", exc)
rows = []

for row in rows:
logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")

@@ -430,7 +430,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// 1. init logging
// 2. tracing panic hook
// 3. sentry
logging::init(LogFormat::from_config(&args.log_format)?)?;
logging::init(
LogFormat::from_config(&args.log_format)?,
logging::TracingErrorLayerEnablement::Disabled,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

@@ -45,14 +45,18 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
*[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
*[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
"pageserver_remote_physical_size",
"pageserver_remote_timeline_client_bytes_started_total",
"pageserver_remote_timeline_client_bytes_finished_total",
)

PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"pageserver_storage_operations_seconds_global_count",
"pageserver_storage_operations_seconds_global_sum",
"pageserver_storage_operations_seconds_global_bucket",
"pageserver_unexpected_ondemand_downloads_count_total",
"libmetrics_launch_timestamp",
"libmetrics_build_info",
"libmetrics_tracing_event_count_total",
)

PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (

@@ -1451,6 +1451,7 @@ class NeonCli(AbstractNeonCli):
branch_name: str,
endpoint_id: Optional[str] = None,
tenant_id: Optional[TenantId] = None,
hot_standby: bool = False,
lsn: Optional[Lsn] = None,
port: Optional[int] = None,
) -> "subprocess.CompletedProcess[str]":
@@ -1470,6 +1471,8 @@ class NeonCli(AbstractNeonCli):
args.extend(["--port", str(port)])
if endpoint_id is not None:
args.append(endpoint_id)
if hot_standby:
args.extend(["--hot-standby", "true"])

res = self.raw_cli(args)
res.check_returncode()
@@ -1817,6 +1820,36 @@ class VanillaPostgres(PgProtocol):
self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
self.configure([f"port = {port}\n"])

def enable_tls(self):
assert not self.running
# generate self-signed certificate
subprocess.run(
[
"openssl",
"req",
"-new",
"-x509",
"-days",
"365",
"-nodes",
"-text",
"-out",
self.pgdatadir / "server.crt",
"-keyout",
self.pgdatadir / "server.key",
"-subj",
"/CN=localhost",
]
)
# configure postgresql.conf
self.configure(
[
"ssl = on",
"ssl_cert_file = 'server.crt'",
"ssl_key_file = 'server.key'",
]
)

def configure(self, options: List[str]):
"""Append lines into postgresql.conf file."""
assert not self.running
@@ -1989,6 +2022,7 @@ class NeonProxy(PgProtocol):
# Link auth backend params
*["--auth-backend", "link"],
*["--uri", NeonProxy.link_auth_uri],
*["--allow-self-signed-compute", "true"],
]

@dataclass(frozen=True)
@@ -2009,6 +2043,7 @@ class NeonProxy(PgProtocol):
def __init__(
self,
neon_binpath: Path,
test_output_dir: Path,
proxy_port: int,
http_port: int,
mgmt_port: int,
@@ -2022,6 +2057,7 @@ class NeonProxy(PgProtocol):
self.host = host
self.http_port = http_port
self.neon_binpath = neon_binpath
self.test_output_dir = test_output_dir
self.proxy_port = proxy_port
self.mgmt_port = mgmt_port
self.auth_backend = auth_backend
@@ -2048,7 +2084,8 @@ class NeonProxy(PgProtocol):
*["--metric-collection-interval", self.metric_collection_interval],
]

self._popen = subprocess.Popen(args)
logfile = open(self.test_output_dir / "proxy.log", "w")
self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile)
self._wait_until_ready()
return self

@@ -2105,7 +2142,7 @@ class NeonProxy(PgProtocol):
try:
self._popen.wait(timeout=5)
except subprocess.TimeoutExpired:
log.warn("failed to gracefully terminate proxy; killing")
log.warning("failed to gracefully terminate proxy; killing")
self._popen.kill()

@staticmethod
@@ -2116,6 +2153,7 @@ class NeonProxy(PgProtocol):

if create_user:
log.info("creating a new user for link auth test")
local_vanilla_pg.enable_tls()
local_vanilla_pg.start()
local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser")

@@ -2149,7 +2187,9 @@ class NeonProxy(PgProtocol):


@pytest.fixture(scope="function")
def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]:
def link_proxy(
port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path
) -> Iterator[NeonProxy]:
"""Neon proxy that routes through link auth."""

http_port = port_distributor.get_port()
@@ -2158,6 +2198,7 @@ def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterato

with NeonProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
@@ -2169,7 +2210,10 @@ def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterato

@pytest.fixture(scope="function")
def static_proxy(
vanilla_pg: VanillaPostgres, port_distributor: PortDistributor, neon_binpath: Path
vanilla_pg: VanillaPostgres,
port_distributor: PortDistributor,
neon_binpath: Path,
test_output_dir: Path,
) -> Iterator[NeonProxy]:
"""Neon proxy that routes directly to vanilla postgres."""

@@ -2188,6 +2232,7 @@ def static_proxy(

with NeonProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
@@ -2206,6 +2251,7 @@ class Endpoint(PgProtocol):
super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres")
self.env = env
self.running = False
self.branch_name: Optional[str] = None # dubious
self.endpoint_id: Optional[str] = None # dubious, see asserts below
self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA
self.tenant_id = tenant_id
@@ -2217,6 +2263,7 @@ class Endpoint(PgProtocol):
self,
branch_name: str,
endpoint_id: Optional[str] = None,
hot_standby: bool = False,
lsn: Optional[Lsn] = None,
config_lines: Optional[List[str]] = None,
) -> "Endpoint":
@@ -2231,12 +2278,14 @@ class Endpoint(PgProtocol):
if endpoint_id is None:
endpoint_id = self.env.generate_endpoint_id()
self.endpoint_id = endpoint_id
self.branch_name = branch_name

self.env.neon_cli.endpoint_create(
branch_name,
endpoint_id=self.endpoint_id,
tenant_id=self.tenant_id,
lsn=lsn,
hot_standby=hot_standby,
port=self.port,
)
path = Path("endpoints") / self.endpoint_id / "pgdata"
@@ -2361,6 +2410,7 @@ class Endpoint(PgProtocol):
self,
branch_name: str,
endpoint_id: Optional[str] = None,
hot_standby: bool = False,
lsn: Optional[Lsn] = None,
config_lines: Optional[List[str]] = None,
) -> "Endpoint":
@@ -2375,6 +2425,7 @@ class Endpoint(PgProtocol):
branch_name=branch_name,
endpoint_id=endpoint_id,
config_lines=config_lines,
hot_standby=hot_standby,
lsn=lsn,
).start()

@@ -2408,6 +2459,7 @@ class EndpointFactory:
endpoint_id: Optional[str] = None,
tenant_id: Optional[TenantId] = None,
lsn: Optional[Lsn] = None,
hot_standby: bool = False,
config_lines: Optional[List[str]] = None,
) -> Endpoint:
ep = Endpoint(
@@ -2421,6 +2473,7 @@ class EndpointFactory:
return ep.create_start(
branch_name=branch_name,
endpoint_id=endpoint_id,
hot_standby=hot_standby,
config_lines=config_lines,
lsn=lsn,
)
@@ -2431,6 +2484,7 @@ class EndpointFactory:
endpoint_id: Optional[str] = None,
tenant_id: Optional[TenantId] = None,
lsn: Optional[Lsn] = None,
hot_standby: bool = False,
config_lines: Optional[List[str]] = None,
) -> Endpoint:
ep = Endpoint(
@@ -2449,6 +2503,7 @@ class EndpointFactory:
branch_name=branch_name,
endpoint_id=endpoint_id,
lsn=lsn,
hot_standby=hot_standby,
config_lines=config_lines,
)

@@ -2458,6 +2513,36 @@ class EndpointFactory:

return self

def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]):
branch_name = origin.branch_name
assert origin in self.endpoints
assert branch_name is not None

return self.create(
branch_name=branch_name,
endpoint_id=endpoint_id,
tenant_id=origin.tenant_id,
lsn=None,
hot_standby=True,
config_lines=config_lines,
)

def new_replica_start(
self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None
):
branch_name = origin.branch_name
assert origin in self.endpoints
assert branch_name is not None

return self.create_start(
branch_name=branch_name,
endpoint_id=endpoint_id,
tenant_id=origin.tenant_id,
lsn=None,
hot_standby=True,
config_lines=config_lines,
)

@dataclass
class SafekeeperPort:
@@ -2543,6 +2628,7 @@ class SafekeeperTimelineStatus:
commit_lsn: Lsn
timeline_start_lsn: Lsn
backup_lsn: Lsn
peer_horizon_lsn: Lsn
remote_consistent_lsn: Lsn


@@ -2575,6 +2661,13 @@ class SafekeeperHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json

def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
res.raise_for_status()
res_json = res.json()
assert isinstance(res_json, dict)
return res_json

def timeline_create(
self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn
):
@@ -2600,6 +2693,7 @@ class SafekeeperHttpClient(requests.Session):
commit_lsn=Lsn(resj["commit_lsn"]),
timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
backup_lsn=Lsn(resj["backup_lsn"]),
peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
)

@@ -2928,32 +3022,18 @@ def fork_at_current_lsn(
return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)


def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
tenant_id: TenantId,
timeline_id: TimelineId,
safekeepers: List[Safekeeper],
pageserver: NeonPageserver,
):
sk_commit_lsns = [
sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers
]
lsn = max(sk_commit_lsns)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)
return lsn


def wait_for_sk_commit_lsn_to_reach_remote_storage(
tenant_id: TenantId,
timeline_id: TimelineId,
safekeepers: List[Safekeeper],
pageserver: NeonPageserver,
):
lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
tenant_id, timeline_id, safekeepers, pageserver
)
ps_http = pageserver.http_client()
def last_flush_lsn_upload(
env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId
) -> Lsn:
"""
Wait for pageserver to catch up to the latest flush LSN of given endpoint,
checkpoint pageserver, and wait for it to be uploaded (remote_consistent_lsn
reaching flush LSN).
"""
last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
ps_http = env.pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
return lsn
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
return last_flush_lsn

@@ -550,3 +550,13 @@ class PageserverHttpClient(requests.Session):
def tenant_break(self, tenant_id: TenantId):
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
self.verbose_error(res)

def post_tracing_event(self, level: str, message: str):
res = self.post(
f"http://localhost:{self.port}/v1/tracing/event",
json={
"level": level,
"message": message,
},
)
self.verbose_error(res)

@@ -54,10 +54,9 @@ def wait_for_upload(
if current_lsn >= lsn:
log.info("wait finished")
return
lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)
log.info(
"waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}"
)
time.sleep(1)
raise Exception(

@@ -59,11 +59,6 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"value": "replica",
"vartype": "enum"
},
{
"name": "hot_standby",
"value": "on",
"vartype": "bool"
},
{
"name": "neon.safekeepers",
"value": """