diff --git a/.github/ansible/staging.eu-central-1.hosts.yaml b/.github/ansible/staging.eu-central-1.hosts.yaml new file mode 100644 index 0000000000..db1d1adcff --- /dev/null +++ b/.github/ansible/staging.eu-central-1.hosts.yaml @@ -0,0 +1,47 @@ +storage: + vars: + bucket_name: neon-dev-storage-eu-central-1 + bucket_region: eu-central-1 + # We only register/update storage in one preview console and manually copy to other instances + console_mgmt_base_url: http://neon-internal-api.helium.aws.neon.build + broker_endpoint: http://storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build:50051 + pageserver_config_stub: + pg_distrib_dir: /usr/local + metric_collection_endpoint: http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events + metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: eu-central-1 + ansible_aws_ssm_bucket_name: neon-dev-storage-eu-central-1 + console_region_id: aws-eu-central-1 + sentry_environment: staging + + children: + pageservers: + hosts: + pageserver-0.eu-central-1.aws.neon.build: + ansible_host: i-011f93ec26cfba2d4 + + safekeepers: + hosts: + safekeeper-0.eu-central-1.aws.neon.build: + ansible_host: i-0ff026d27babf8ddd + safekeeper-1.eu-central-1.aws.neon.build: + ansible_host: i-03983a49ee54725d9 + safekeeper-2.eu-central-1.aws.neon.build: + ansible_host: i-0bd025ecdb61b0db3 diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index e63ed6e639..fb218c443d 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -48,9 +48,9 @@ storage: hosts: safekeeper-0.us-east-2.aws.neon.build: ansible_host: i-027662bd552bf5db0 - safekeeper-1.us-east-2.aws.neon.build: - ansible_host: i-0171efc3604a7b907 safekeeper-2.us-east-2.aws.neon.build: ansible_host: i-0de0b03a51676a6ce + safekeeper-3.us-east-2.aws.neon.build: + ansible_host: i-05f8ba2cda243bd18 safekeeper-99.us-east-2.aws.neon.build: ansible_host: i-0d61b6a2ea32028d5 diff --git a/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml new file mode 100644 index 0000000000..aaa1ec59b4 --- /dev/null +++ b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml @@ -0,0 +1,52 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: staging + neon_service: storage-broker + +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 + +ingress: + enabled: false + +metrics: + enabled: false + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + +settings: + sentryEnvironment: "staging" diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index a8567665d3..a7d8587ec2 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -23,6 +23,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.eu-west-1.aws.neon.build" + otelExporterOtlpEndpoint: "https://otel-collector.zeta.eu-west-1.internal.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index feca05aff6..893e0fab10 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" domain: "pg.neon.build" + otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build" sentryEnvironment: "staging" metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 46cfdd2e69..77f6cf080e 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.cloud.stage.neon.tech" + otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index fdd869c122..2510d624cd 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -25,6 +25,7 @@ settings: authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.us-east-2.aws.neon.build" extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"] + otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" diff --git a/.github/helm-values/preview-template.neon-proxy-scram.yaml b/.github/helm-values/preview-template.neon-proxy-scram.yaml new file mode 100644 index 0000000000..f4bd418e28 --- /dev/null +++ b/.github/helm-values/preview-template.neon-proxy-scram.yaml @@ -0,0 +1,67 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. + +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/management/api/v2" + domain: "*.cloud.${PREVIEW_NAME}.aws.neon.build" + sentryEnvironment: "staging" + wssPort: 8443 + metricCollectionEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/billing/api/v1/usage_events" + metricCollectionInterval: "1min" + +# -- Additional labels for neon-proxy pods +podLabels: + neon_service: proxy-scram + neon_env: test + neon_region: ${PREVIEW_NAME}.eu-central-1 + + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: cloud.${PREVIEW_NAME}.aws.neon.build + httpsPort: 443 + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml index f113d1f861..1c7e646810 100644 --- a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml @@ -23,8 +23,8 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-1.aws.neon.tech" - # These domains haven't been delegated yet. - # extraDomains: ["*.us-east-1.retooldb.com", "*.us-east-1.postgres.vercel-storage.com"] + # *.us-east-1.retooldb.com hasn't been delegated yet. + extraDomains: ["*.us-east-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3212b76731..e5ba7aa3eb 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -111,8 +111,21 @@ jobs: - name: Get postgres headers run: make postgres-headers -j$(nproc) - - name: Run cargo clippy - run: ./run_clippy.sh + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. + # NB: keep clippy args in sync with ./run_clippy.sh + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS + - name: Run cargo clippy (release) + run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting @@ -541,7 +554,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init - needs: [ push-docker-hub, tag ] + needs: [ promote-images, tag ] steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -584,8 +597,7 @@ jobs: neon-image: runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - # https://github.com/GoogleContainerTools/kaniko/issues/2005 - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: shell: sh -eu {0} @@ -597,11 +609,32 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Kaniko build neon - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder @@ -652,7 +685,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: shell: sh -eu {0} @@ -661,18 +694,41 @@ jobs: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Kaniko build compute tools - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --dockerfile Dockerfile.compute-tools + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder run: rm -rf ~/.ecr compute-node-image: runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug needs: [ tag ] strategy: fail-fast: false @@ -689,12 +745,36 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Kaniko build compute node with extensions - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --build-arg PG_VERSION=${{ matrix.version }} + --dockerfile Dockerfile.compute-node + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder run: rm -rf ~/.ecr @@ -786,13 +866,11 @@ jobs: runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] container: golang:1.19-bullseye - if: github.event_name != 'workflow_dispatch' + # Don't add if-condition here. + # The job should always be run because we have dependant other jobs that shouldn't be skipped steps: - name: Install Crane & ECR helper - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' run: | go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 @@ -802,10 +880,15 @@ jobs: mkdir /github/home/.docker/ echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + - name: Copy vm-compute-node images to Docker Hub + run: | + crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 + crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 + - name: Add latest tag to images if: | (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + github.event_name != 'workflow_dispatch' run: | crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest @@ -814,50 +897,10 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - - name: Cleanup ECR folder - run: rm -rf ~/.ecr - - push-docker-hub: - runs-on: [ self-hosted, dev, x64 ] - needs: [ promote-images, tag ] - container: golang:1.19-bullseye - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Pull neon image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon - - - name: Pull compute tools image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools - - - name: Pull compute node v14 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 - - - name: Pull vm compute node v14 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - - - name: Pull compute node v15 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15 - - - name: Pull vm compute node v15 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - - - name: Pull rust image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust - - name: Push images to production ECR if: | (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + github.event_name != 'workflow_dispatch' run: | crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest @@ -872,28 +915,12 @@ jobs: echo "" > /github/home/.docker/config.json crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - - name: Push neon image to Docker Hub - run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}} + - name: Push vm-compute-node to Docker Hub + run: | + crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} + crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - - name: Push compute tools image to Docker Hub - run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - - - name: Push compute node v14 image to Docker Hub - run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} - - - name: Push vm compute node v14 image to Docker Hub - run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - - - name: Push compute node v15 image to Docker Hub - run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} - - - name: Push vm compute node v15 image to Docker Hub - run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - - - name: Push rust image to Docker Hub - run: crane push rust neondatabase/rust:pinned - - - name: Add latest tag to images in Docker Hub + - name: Push latest tags to Docker Hub if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -913,7 +940,7 @@ jobs: container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: | contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && github.event_name != 'workflow_dispatch' @@ -947,7 +974,7 @@ jobs: deploy: runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Fix git ownership @@ -984,7 +1011,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index b080a29f7c..5d1c6e0e16 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -48,7 +48,8 @@ jobs: shell: bash strategy: matrix: - target_region: [ eu-west-1, us-east-2 ] + # TODO(sergey): Fix storage deploy in eu-central-1 + target_region: [ eu-west-1, us-east-2] environment: name: dev-${{ matrix.target_region }} steps: @@ -133,6 +134,53 @@ jobs: - name: Cleanup helm folder run: rm -rf ~/.cache + + deploy-preview-proxy-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: eu-central-1 + target_cluster: dev-eu-central-1-alpha + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy preview proxies + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + for PREVIEW_NAME in helium argon krypton xenon radon oganesson hydrogen nitrogen oxygen fluorine chlorine; do + export PREVIEW_NAME + envsubst <.github/helm-values/preview-template.neon-proxy-scram.yaml >preview-${PREVIEW_NAME}.neon-proxy-scram.yaml + helm upgrade neon-proxy-scram-${PREVIEW_NAME} neondatabase/neon-proxy --namespace neon-proxy-${PREVIEW_NAME} --create-namespace --install --atomic -f preview-${PREVIEW_NAME}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + done + + - name: Cleanup helm folder + run: rm -rf ~/.cache deploy-storage-broker-new: runs-on: [ self-hosted, gen3, small ] @@ -148,6 +196,8 @@ jobs: target_cluster: dev-us-east-2-beta - target_region: eu-west-1 target_cluster: dev-eu-west-1-zeta + - target_region: eu-central-1 + target_cluster: dev-eu-central-1-alpha environment: name: dev-${{ matrix.target_region }} steps: diff --git a/.neon_clippy_args b/.neon_clippy_args new file mode 100644 index 0000000000..25e09c61a6 --- /dev/null +++ b/.neon_clippy_args @@ -0,0 +1,4 @@ +# * `-A unknown_lints` – do not warn about unknown lint suppressions +# that people with newer toolchains might use +# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) +export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings" diff --git a/Cargo.lock b/Cargo.lock index 88ae3b57d2..c9af0d04a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4636,6 +4636,16 @@ dependencies = [ "valuable", ] +[[package]] +name = "tracing-error" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" +dependencies = [ + "tracing", + "tracing-subscriber", +] + [[package]] name = "tracing-futures" version = "0.2.5" @@ -4886,6 +4896,7 @@ dependencies = [ "thiserror", "tokio", "tracing", + "tracing-error", "tracing-subscriber", "url", "uuid", diff --git a/Cargo.toml b/Cargo.toml index 0b545e6190..f4872433cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,6 +110,7 @@ toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" +tracing-error = "0.2.0" tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 309310407d..36dbc382b5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -73,7 +73,7 @@ fn main() -> Result<()> { // Try to use just 'postgres' if no path is provided let pgbin = matches.get_one::("pgbin").unwrap(); - let mut spec = None; + let spec; let mut live_config_allowed = false; match spec_json { // First, try to get cluster spec from the cli argument @@ -89,9 +89,13 @@ fn main() -> Result<()> { } else if let Some(id) = compute_id { if let Some(cp_base) = control_plane_uri { live_config_allowed = true; - if let Ok(s) = get_spec_from_control_plane(cp_base, id) { - spec = Some(s); - } + spec = match get_spec_from_control_plane(cp_base, id) { + Ok(s) => s, + Err(e) => { + error!("cannot get response from control plane: {}", e); + panic!("neither spec nor confirmation that compute is in the Empty state was received"); + } + }; } else { panic!("must specify both --control-plane-uri and --compute-id or none"); } @@ -114,7 +118,6 @@ fn main() -> Result<()> { spec_set = false; } let compute_node = ComputeNode { - start_time: Utc::now(), connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), @@ -147,6 +150,17 @@ fn main() -> Result<()> { let mut state = compute.state.lock().unwrap(); let pspec = state.pspec.as_ref().expect("spec must be set"); let startup_tracing_context = pspec.spec.startup_tracing_context.clone(); + + // Record for how long we slept waiting for the spec. + state.metrics.wait_for_spec_ms = Utc::now() + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + // Reset start time to the actual start of the configuration, so that + // total startup time was properly measured at the end. + state.start_time = Utc::now(); + state.status = ComputeStatus::Init; compute.state_changed.notify_all(); drop(state); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 51de2b6e0a..b6bc234beb 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -38,7 +38,6 @@ use crate::spec::*; /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { - pub start_time: DateTime, // Url type maintains proper escaping pub connstr: url::Url, pub pgdata: String, @@ -66,6 +65,7 @@ pub struct ComputeNode { #[derive(Clone, Debug)] pub struct ComputeState { + pub start_time: DateTime, pub status: ComputeStatus, /// Timestamp of the last Postgres activity pub last_active: DateTime, @@ -77,6 +77,7 @@ pub struct ComputeState { impl ComputeState { pub fn new() -> Self { Self { + start_time: Utc::now(), status: ComputeStatus::Empty, last_active: Utc::now(), error: None, @@ -248,18 +249,63 @@ impl ComputeNode { /// safekeepers sync, basebackup, etc. #[instrument(skip(self, compute_state))] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { + #[derive(Clone)] + enum Replication { + Primary, + Static { lsn: Lsn }, + HotStandby, + } + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); + let hot_replica = if let Some(option) = spec.cluster.settings.find_ref("hot_standby") { + if let Some(value) = &option.value { + anyhow::ensure!(option.vartype == "bool"); + matches!(value.as_str(), "on" | "yes" | "true") + } else { + false + } + } else { + false + }; + + let replication = if hot_replica { + Replication::HotStandby + } else if let Some(lsn) = spec.cluster.settings.find("recovery_target_lsn") { + Replication::Static { + lsn: Lsn::from_str(&lsn)?, + } + } else { + Replication::Primary + }; + // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?; - info!("starting safekeepers syncing"); - let lsn = self - .sync_safekeepers(pspec.storage_auth_token.clone()) - .with_context(|| "failed to sync safekeepers")?; - info!("safekeepers synced at LSN {}", lsn); + // Syncing safekeepers is only safe with primary nodes: if a primary + // is already connected it will be kicked out, so a secondary (standby) + // cannot sync safekeepers. + let lsn = match &replication { + Replication::Primary => { + info!("starting safekeepers syncing"); + let lsn = self + .sync_safekeepers(pspec.storage_auth_token.clone()) + .with_context(|| "failed to sync safekeepers")?; + info!("safekeepers synced at LSN {}", lsn); + lsn + } + Replication::Static { lsn } => { + info!("Starting read-only node at static LSN {}", lsn); + *lsn + } + Replication::HotStandby => { + info!("Initializing standby from latest Pageserver LSN"); + Lsn(0) + } + }; info!( "getting basebackup@{} from pageserver {}", @@ -275,6 +321,13 @@ impl ComputeNode { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; + match &replication { + Replication::Primary | Replication::Static { .. } => {} + Replication::HotStandby => { + add_standby_signal(pgdata_path)?; + } + } + Ok(()) } @@ -425,7 +478,7 @@ impl ComputeNode { .unwrap() .as_millis() as u64; state.metrics.total_startup_ms = startup_end_time - .signed_duration_since(self.start_time) + .signed_duration_since(compute_state.start_time) .to_std() .unwrap() .as_millis() as u64; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 3ca688de69..4468f6f5e4 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -18,6 +18,7 @@ use tracing_utils::http::OtelName; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { + start_time: state.start_time, tenant: state .pspec .as_ref() diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index bdb09d4a6b..cc8f074a50 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -152,11 +152,14 @@ components: type: object description: Compute startup metrics. required: + - wait_for_spec_ms - sync_safekeepers_ms - basebackup_ms - config_ms - total_startup_ms properties: + wait_for_spec_ms: + type: integer sync_safekeepers_ms: type: integer basebackup_ms: @@ -181,6 +184,13 @@ components: - status - last_active properties: + start_time: + type: string + description: | + Time when compute was started. If initially compute was started in the `empty` + state and then provided with valid spec, `start_time` will be reset to the + moment, when spec was received. + example: "2022-10-12T07:20:50.52Z" status: $ref: '#/components/schemas/ComputeStatus' last_active: diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index bb787d0506..40dbea6907 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -94,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions { pub trait GenericOptionsSearch { fn find(&self, name: &str) -> Option; + fn find_ref(&self, name: &str) -> Option<&GenericOption>; } impl GenericOptionsSearch for GenericOptions { @@ -103,6 +104,12 @@ impl GenericOptionsSearch for GenericOptions { let op = ops.iter().find(|s| s.name == name)?; op.value.clone() } + + /// Lookup option by name, returning ref + fn find_ref(&self, name: &str) -> Option<&GenericOption> { + let ops = self.as_ref()?; + ops.iter().find(|s| s.name == name) + } } pub trait RoleExt { diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 088f74335a..bf3c407202 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,45 +1,121 @@ +use std::fs::File; use std::path::Path; use std::str::FromStr; use anyhow::{anyhow, bail, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; -use tracing::{info, info_span, instrument, span_enabled, warn, Level}; +use reqwest::StatusCode; +use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use compute_api::responses::ControlPlaneSpecResponse; +use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; +// Do control plane request and return response if any. In case of error it +// returns a bool flag indicating whether it makes sense to retry the request +// and a string with error message. +fn do_control_plane_request( + uri: &str, + jwt: &str, +) -> Result { + let resp = reqwest::blocking::Client::new() + .get(uri) + .header("Authorization", jwt) + .send() + .map_err(|e| { + ( + true, + format!("could not perform spec request to control plane: {}", e), + ) + })?; + + match resp.status() { + StatusCode::OK => match resp.json::() { + Ok(spec_resp) => Ok(spec_resp), + Err(e) => Err(( + true, + format!("could not deserialize control plane response: {}", e), + )), + }, + StatusCode::SERVICE_UNAVAILABLE => { + Err((true, "control plane is temporarily unavailable".to_string())) + } + StatusCode::BAD_GATEWAY => { + // We have a problem with intermittent 502 errors now + // https://github.com/neondatabase/cloud/issues/2353 + // It's fine to retry GET request in this case. + Err((true, "control plane request failed with 502".to_string())) + } + // Another code, likely 500 or 404, means that compute is unknown to the control plane + // or some internal failure happened. Doesn't make much sense to retry in this case. + _ => Err(( + false, + format!( + "unexpected control plane response status code: {}", + resp.status() + ), + )), + } +} + /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` /// env variable is set, it will be used for authorization. -pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result { +pub fn get_spec_from_control_plane( + base_uri: &str, + compute_id: &str, +) -> Result> { let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec"); - let jwt: String = match std::env::var("NEON_CONSOLE_JWT") { + let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), }; + let mut attempt = 1; + let mut spec: Result> = Ok(None); + info!("getting spec from control plane: {}", cp_uri); - // TODO: check the response. We should distinguish cases when it's - // - network error, then retry - // - no spec for compute yet, then wait - // - compute id is unknown or any other error, then bail out - let resp: ControlPlaneSpecResponse = reqwest::blocking::Client::new() - .get(cp_uri) - .header("Authorization", jwt) - .send() - .map_err(|e| anyhow!("could not send spec request to control plane: {}", e))? - .json() - .map_err(|e| anyhow!("could not get compute spec from control plane: {}", e))?; + // Do 3 attempts to get spec from the control plane using the following logic: + // - network error -> then retry + // - compute id is unknown or any other error -> bail out + // - no spec for compute yet (Empty state) -> return Ok(None) + // - got spec -> return Ok(Some(spec)) + while attempt < 4 { + spec = match do_control_plane_request(&cp_uri, &jwt) { + Ok(spec_resp) => match spec_resp.status { + ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Attached => { + if let Some(spec) = spec_resp.spec { + Ok(Some(spec)) + } else { + bail!("compute is attached, but spec is empty") + } + } + }, + Err((retry, msg)) => { + if retry { + Err(anyhow!(msg)) + } else { + bail!(msg); + } + } + }; - if let Some(spec) = resp.spec { - Ok(spec) - } else { - bail!("could not get compute spec from control plane") + if let Err(e) = &spec { + error!("attempt {} to get spec failed with: {}", attempt, e); + } else { + return spec; + } + + attempt += 1; + std::thread::sleep(std::time::Duration::from_millis(100)); } + + // All attempts failed, return error. + spec } /// It takes cluster specification and does the following: @@ -70,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { Ok(()) } +/// Create a standby.signal file +pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { + // XXX: consider making it a part of spec.json + info!("adding standby.signal"); + let signalfile = pgdata_path.join("standby.signal"); + + if !signalfile.exists() { + info!("created standby.signal"); + File::create(signalfile)?; + } else { + info!("reused pre-existing standby.signal"); + } + Ok(()) +} + /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. #[instrument(skip_all)] diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 665cad8783..09278e1726 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::endpoint::ComputeControlPlane; +use control_plane::endpoint::Replication; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; @@ -474,7 +475,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; println!("Creating endpoint for imported timeline ..."); - cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?; + cplane.new_endpoint( + tenant_id, + name, + timeline_id, + None, + pg_version, + Replication::Primary, + )?; println!("Done"); } Some(("branch", branch_match)) => { @@ -560,20 +568,20 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .iter() .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) { - let lsn_str = match endpoint.lsn { - None => { - // -> primary endpoint + let lsn_str = match endpoint.replication { + Replication::Static(lsn) => { + // -> read-only endpoint + // Use the node's LSN. + lsn.to_string() + } + _ => { + // -> primary endpoint or hot replica // Use the LSN at the end of the timeline. timeline_infos .get(&endpoint.timeline_id) .map(|bi| bi.last_record_lsn.to_string()) .unwrap_or_else(|| "?".to_string()) } - Some(lsn) => { - // -> read-only endpoint - // Use the endpoint's LSN. - lsn.to_string() - } }; let branch_name = timeline_name_mappings @@ -619,7 +627,26 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .context("Failed to parse postgres version from the argument string")?; - cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?; + let hot_standby = sub_args + .get_one::("hot-standby") + .copied() + .unwrap_or(false); + + let replication = match (lsn, hot_standby) { + (Some(lsn), false) => Replication::Static(lsn), + (None, true) => Replication::Replica, + (None, false) => Replication::Primary, + (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), + }; + + cplane.new_endpoint( + tenant_id, + &endpoint_id, + timeline_id, + port, + pg_version, + replication, + )?; } "start" => { let port: Option = sub_args.get_one::("port").copied(); @@ -637,7 +664,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( None }; + let hot_standby = sub_args + .get_one::("hot-standby") + .copied() + .unwrap_or(false); + if let Some(endpoint) = endpoint { + match (&endpoint.replication, hot_standby) { + (Replication::Static(_), true) => { + bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") + } + (Replication::Primary, true) => { + bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") + } + _ => {} + } println!("Starting existing endpoint {endpoint_id}..."); endpoint.start(&auth_token)?; } else { @@ -659,6 +700,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .get_one::("pg-version") .copied() .context("Failed to `pg-version` from the argument string")?; + + let replication = match (lsn, hot_standby) { + (Some(lsn), false) => Replication::Static(lsn), + (None, true) => Replication::Replica, + (None, false) => Replication::Primary, + (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), + }; + // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X @@ -670,9 +719,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( tenant_id, endpoint_id, timeline_id, - lsn, port, pg_version, + replication, )?; ep.start(&auth_token)?; } @@ -928,6 +977,12 @@ fn cli() -> Command { .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.") .required(false); + let hot_standby_arg = Arg::new("hot-standby") + .value_parser(value_parser!(bool)) + .long("hot-standby") + .help("If set, the node will be a hot replica on the specified timeline") + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1052,6 +1107,7 @@ fn cli() -> Command { .long("config-only") .required(false)) .arg(pg_version_arg.clone()) + .arg(hot_standby_arg.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1062,6 +1118,7 @@ fn cli() -> Command { .arg(lsn_arg) .arg(port_arg) .arg(pg_version_arg) + .arg(hot_standby_arg) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 9e85138e68..7d3485518f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -68,18 +68,19 @@ impl ComputeControlPlane { tenant_id: TenantId, name: &str, timeline_id: TimelineId, - lsn: Option, port: Option, pg_version: u32, + replication: Replication, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); + let ep = Arc::new(Endpoint { name: name.to_owned(), address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, - lsn, + replication, tenant_id, pg_version, }); @@ -95,6 +96,18 @@ impl ComputeControlPlane { /////////////////////////////////////////////////////////////////////////////// +#[derive(Debug, Clone, Eq, PartialEq)] +pub enum Replication { + // Regular read-write node + Primary, + // if recovery_target_lsn is provided, and we want to pin the node to a specific LSN + Static(Lsn), + // Hot standby; read-only replica. + // Future versions may want to distinguish between replicas with hot standby + // feedback and other kinds of replication configurations. + Replica, +} + #[derive(Debug)] pub struct Endpoint { /// used as the directory name @@ -102,7 +115,7 @@ pub struct Endpoint { pub tenant_id: TenantId, pub timeline_id: TimelineId, // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary. - pub lsn: Option, + pub replication: Replication, // port and address of the Postgres server pub address: SocketAddr, @@ -153,9 +166,17 @@ impl Endpoint { fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); let pg_version = u32::from_str(&pg_version_str)?; - // parse recovery_target_lsn, if any - let recovery_target_lsn: Option = - conf.parse_field_optional("recovery_target_lsn", &context)?; + // parse recovery_target_lsn and primary_conninfo into Recovery Target, if any + let replication = if let Some(lsn_str) = conf.get("recovery_target_lsn") { + Replication::Static(Lsn::from_str(lsn_str)?) + } else if let Some(slot_name) = conf.get("primary_slot_name") { + let slot_name = slot_name.to_string(); + let prefix = format!("repl_{}_", timeline_id); + assert!(slot_name.starts_with(&prefix)); + Replication::Replica + } else { + Replication::Primary + }; // ok now Ok(Endpoint { @@ -164,7 +185,7 @@ impl Endpoint { env: env.clone(), pageserver: Arc::clone(pageserver), timeline_id, - lsn: recovery_target_lsn, + replication, tenant_id, pg_version, }) @@ -299,50 +320,83 @@ impl Endpoint { conf.append("neon.pageserver_connstring", &pageserver_connstr); conf.append("neon.tenant_id", &self.tenant_id.to_string()); conf.append("neon.timeline_id", &self.timeline_id.to_string()); - if let Some(lsn) = self.lsn { - conf.append("recovery_target_lsn", &lsn.to_string()); - } conf.append_line(""); - // Configure backpressure - // - Replication write lag depends on how fast the walreceiver can process incoming WAL. - // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, - // so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB. - // Actually latency should be much smaller (better if < 1sec). But we assume that recently - // updates pages are not requested from pageserver. - // - Replication flush lag depends on speed of persisting data by checkpointer (creation of - // delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to - // remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long - // recovery time (in case of pageserver crash) and disk space overflow at safekeepers. - // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread. - // To be able to restore database in case of pageserver node crash, safekeeper should not - // remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers - // (if they are not able to upload WAL to S3). - conf.append("max_replication_write_lag", "15MB"); - conf.append("max_replication_flush_lag", "10GB"); + // Replication-related configurations, such as WAL sending + match &self.replication { + Replication::Primary => { + // Configure backpressure + // - Replication write lag depends on how fast the walreceiver can process incoming WAL. + // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, + // so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB. + // Actually latency should be much smaller (better if < 1sec). But we assume that recently + // updates pages are not requested from pageserver. + // - Replication flush lag depends on speed of persisting data by checkpointer (creation of + // delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to + // remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long + // recovery time (in case of pageserver crash) and disk space overflow at safekeepers. + // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread. + // To be able to restore database in case of pageserver node crash, safekeeper should not + // remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers + // (if they are not able to upload WAL to S3). + conf.append("max_replication_write_lag", "15MB"); + conf.append("max_replication_flush_lag", "10GB"); - if !self.env.safekeepers.is_empty() { - // Configure Postgres to connect to the safekeepers - conf.append("synchronous_standby_names", "walproposer"); + if !self.env.safekeepers.is_empty() { + // Configure Postgres to connect to the safekeepers + conf.append("synchronous_standby_names", "walproposer"); - let safekeepers = self - .env - .safekeepers - .iter() - .map(|sk| format!("localhost:{}", sk.pg_port)) - .collect::>() - .join(","); - conf.append("neon.safekeepers", &safekeepers); - } else { - // We only use setup without safekeepers for tests, - // and don't care about data durability on pageserver, - // so set more relaxed synchronous_commit. - conf.append("synchronous_commit", "remote_write"); + let safekeepers = self + .env + .safekeepers + .iter() + .map(|sk| format!("localhost:{}", sk.pg_port)) + .collect::>() + .join(","); + conf.append("neon.safekeepers", &safekeepers); + } else { + // We only use setup without safekeepers for tests, + // and don't care about data durability on pageserver, + // so set more relaxed synchronous_commit. + conf.append("synchronous_commit", "remote_write"); - // Configure the node to stream WAL directly to the pageserver - // This isn't really a supported configuration, but can be useful for - // testing. - conf.append("synchronous_standby_names", "pageserver"); + // Configure the node to stream WAL directly to the pageserver + // This isn't really a supported configuration, but can be useful for + // testing. + conf.append("synchronous_standby_names", "pageserver"); + } + } + Replication::Static(lsn) => { + conf.append("recovery_target_lsn", &lsn.to_string()); + } + Replication::Replica => { + assert!(!self.env.safekeepers.is_empty()); + + // TODO: use future host field from safekeeper spec + // Pass the list of safekeepers to the replica so that it can connect to any of them, + // whichever is availiable. + let sk_ports = self + .env + .safekeepers + .iter() + .map(|x| x.pg_port.to_string()) + .collect::>() + .join(","); + let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(","); + + let connstr = format!( + "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true", + sk_hosts, + sk_ports, + &self.timeline_id.to_string(), + &self.tenant_id.to_string(), + ); + + let slot_name = format!("repl_{}_", self.timeline_id); + conf.append("primary_conninfo", connstr.as_str()); + conf.append("primary_slot_name", slot_name.as_str()); + conf.append("hot_standby", "on"); + } } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; @@ -355,21 +409,27 @@ impl Endpoint { } fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = if let Some(lsn) = self.lsn { - Some(lsn) - } else if !self.env.safekeepers.is_empty() { - // LSN 0 means that it is bootstrap and we need to download just - // latest data from the pageserver. That is a bit clumsy but whole bootstrap - // procedure evolves quite actively right now, so let's think about it again - // when things would be more stable (TODO). - let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; - if lsn == Lsn(0) { - None - } else { - Some(lsn) + let backup_lsn = match &self.replication { + Replication::Primary => { + if !self.env.safekeepers.is_empty() { + // LSN 0 means that it is bootstrap and we need to download just + // latest data from the pageserver. That is a bit clumsy but whole bootstrap + // procedure evolves quite actively right now, so let's think about it again + // when things would be more stable (TODO). + let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; + if lsn == Lsn(0) { + None + } else { + Some(lsn) + } + } else { + None + } + } + Replication::Static(lsn) => Some(*lsn), + Replication::Replica => { + None // Take the latest snapshot available to start with } - } else { - None }; self.do_basebackup(backup_lsn)?; @@ -466,7 +526,7 @@ impl Endpoint { // 3. Load basebackup self.load_basebackup(auth_token)?; - if self.lsn.is_some() { + if self.replication != Replication::Primary { File::create(self.pgdata().join("standby.signal"))?; } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index b700d426ba..75991045a4 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -359,8 +359,8 @@ impl PageServerNode { .transpose() .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings - .get("eviction_policy") - .map(|x| serde_json::from_str(x)) + .remove("eviction_policy") + .map(serde_json::from_str) .transpose() .context("Failed to parse 'eviction_policy' json")?, min_resident_size_override: settings diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 34dc769e78..638575eb82 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -13,7 +13,7 @@ use std::io::BufRead; use std::str::FromStr; /// In-memory representation of a postgresql.conf file -#[derive(Default)] +#[derive(Default, Debug)] pub struct PostgresConf { lines: Vec, hash: HashMap, diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index 10ae0b0ecf..565e5e368e 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -28,11 +28,6 @@ "value": "replica", "vartype": "enum" }, - { - "name": "hot_standby", - "value": "on", - "vartype": "bool" - }, { "name": "wal_log_hints", "value": "on", diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 370b2c5626..c409563b56 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -14,6 +14,7 @@ pub struct GenericAPIError { #[derive(Serialize, Debug)] #[serde(rename_all = "snake_case")] pub struct ComputeStatusResponse { + pub start_time: DateTime, pub tenant: Option, pub timeline: Option, pub status: ComputeStatus, @@ -63,6 +64,7 @@ where /// Response of the /metrics.json API #[derive(Clone, Debug, Default, Serialize)] pub struct ComputeMetrics { + pub wait_for_spec_ms: u64, pub sync_safekeepers_ms: u64, pub basebackup_ms: u64, pub config_ms: u64, @@ -75,4 +77,16 @@ pub struct ComputeMetrics { #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { pub spec: Option, + pub status: ControlPlaneComputeStatus, +} + +#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ControlPlaneComputeStatus { + // Compute is known to control-plane, but it's not + // yet attached to any timeline / endpoint. + Empty, + // Compute is attached to some timeline / endpoint and + // should be able to start with provided spec. + Attached, } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 492ec9748a..b8eb469cb0 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -95,10 +95,13 @@ pub fn generate_wal_segment( segno: u64, system_id: u64, pg_version: u32, + lsn: Lsn, ) -> Result { + assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); + match pg_version { - 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), - 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + 14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn), _ => Err(SerializeError::BadInput), } } diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 09678353af..6bc89ed37e 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -195,6 +195,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_LONG_HEADER: u16 = 0x0002; /* From fsm_internals.h */ diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 272c4d6dcc..4d7bb61883 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -270,6 +270,11 @@ impl XLogPageHeaderData { use utils::bin_ser::LeSer; XLogPageHeaderData::des_from(&mut buf.reader()) } + + pub fn encode(&self) -> Result { + use utils::bin_ser::LeSer; + self.ser().map(|b| b.into()) + } } impl XLogLongPageHeaderData { @@ -328,22 +333,32 @@ impl CheckPoint { } } -// -// Generate new, empty WAL segment. -// We need this segment to start compute node. -// -pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { +/// Generate new, empty WAL segment, with correct block headers at the first +/// page of the segment and the page that contains the given LSN. +/// We need this segment to start compute node. +pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result { let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + + let page_off = lsn.block_offset(); + let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE); + + let first_page_only = seg_off < XLOG_BLCKSZ; + let (shdr_rem_len, infoflags) = if first_page_only { + (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD) + } else { + (0, 0) + }; + let hdr = XLogLongPageHeaderData { std: { XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: pg_constants::XLP_LONG_HEADER, + xlp_info: pg_constants::XLP_LONG_HEADER | infoflags, xlp_tli: PG_TLI, xlp_pageaddr: pageaddr, - xlp_rem_len: 0, + xlp_rem_len: shdr_rem_len as u32, ..Default::default() // Put 0 in padding fields. } }, @@ -357,6 +372,33 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result= pg_constants::SIZE_OF_PAGE_HEADER as u64 { + pg_constants::XLP_FIRST_IS_CONTRECORD + } else { + 0 + }, + xlp_tli: PG_TLI, + xlp_pageaddr: lsn.page_lsn().0, + xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { + page_off as u32 + } else { + 0u32 + }, + ..Default::default() // Put 0 in padding fields. + }; + let hdr_bytes = header.encode()?; + + debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len()); + debug_assert_ne!(block_offset, 0); + + seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]); + } + Ok(seg_buf.freeze()) } diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs index 048e99d841..86a6888f98 100644 --- a/libs/remote_storage/tests/pagination_tests.rs +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -99,7 +99,11 @@ struct S3WithTestBlobs { #[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledS3 { async fn setup() -> Self { - utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed"); + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + ) + .expect("logging init failed"); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index dc6326e73e..2b04dfdef6 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -27,7 +27,8 @@ signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true -tracing-subscriber = { workspace = true, features = ["json"] } +tracing-error.workspace = true +tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true serde_with.workspace = true strum.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 616f2b8468..b11aef9892 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -76,6 +76,7 @@ where let log_quietly = method == Method::GET; async move { + let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); if log_quietly { debug!("Handling request"); } else { @@ -87,7 +88,11 @@ where // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call. // // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally. - match (self.0)(request).await { + let res = (self.0)(request).await; + + cancellation_guard.disarm(); + + match res { Ok(response) => { let response_status = response.status(); if log_quietly && response_status.is_success() { @@ -105,6 +110,38 @@ where } } +/// Drop guard to WARN in case the request was dropped before completion. +struct RequestCancelled { + warn: Option, +} + +impl RequestCancelled { + /// Create the drop guard using the [`tracing::Span::current`] as the span. + fn warn_when_dropped_without_responding() -> Self { + RequestCancelled { + warn: Some(tracing::Span::current()), + } + } + + /// Consume the drop guard without logging anything. + fn disarm(mut self) { + self.warn = None; + } +} + +impl Drop for RequestCancelled { + fn drop(&mut self) { + if let Some(span) = self.warn.take() { + // the span has all of the info already, but the outer `.instrument(span)` has already + // been dropped, so we need to manually re-enter it for this message. + // + // this is what the instrument would do before polling so it is fine. + let _g = span.entered(); + warn!("request was dropped before completing"); + } + } +} + async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 92c243ede3..10862d1771 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -54,6 +54,8 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod tracing_span_assert; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f770622a60..2b8c852d86 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use anyhow::Context; +use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] @@ -23,24 +24,81 @@ impl LogFormat { } } -pub fn init(log_format: LogFormat) -> anyhow::Result<()> { - let default_filter_str = "info"; +static TRACING_EVENT_COUNT: Lazy = Lazy::new(|| { + metrics::register_int_counter_vec!( + "libmetrics_tracing_event_count", + "Number of tracing events, by level", + &["level"] + ) + .expect("failed to define metric") +}); +struct TracingEventCountLayer(&'static metrics::IntCounterVec); + +impl tracing_subscriber::layer::Layer for TracingEventCountLayer +where + S: tracing::Subscriber, +{ + fn on_event( + &self, + event: &tracing::Event<'_>, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + let level = event.metadata().level(); + let level = match *level { + tracing::Level::ERROR => "error", + tracing::Level::WARN => "warn", + tracing::Level::INFO => "info", + tracing::Level::DEBUG => "debug", + tracing::Level::TRACE => "trace", + }; + self.0.with_label_values(&[level]).inc(); + } +} + +/// Whether to add the `tracing_error` crate's `ErrorLayer` +/// to the global tracing subscriber. +/// +pub enum TracingErrorLayerEnablement { + /// Do not add the `ErrorLayer`. + Disabled, + /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset. + EnableWithRustLogFilter, +} + +pub fn init( + log_format: LogFormat, + tracing_error_layer_enablement: TracingErrorLayerEnablement, +) -> anyhow::Result<()> { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. - let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str)); + let rust_log_env_filter = || { + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) + }; - let base_logger = tracing_subscriber::fmt() - .with_env_filter(env_filter) - .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_writer(std::io::stdout); - - match log_format { - LogFormat::Json => base_logger.json().init(), - LogFormat::Plain => base_logger.init(), - LogFormat::Test => base_logger.with_test_writer().init(), + // NB: the order of the with() calls does not matter. + // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering + use tracing_subscriber::prelude::*; + let r = tracing_subscriber::registry(); + let r = r.with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_writer(std::io::stdout); + let log_layer = match log_format { + LogFormat::Json => log_layer.json().boxed(), + LogFormat::Plain => log_layer.boxed(), + LogFormat::Test => log_layer.with_test_writer().boxed(), + }; + log_layer.with_filter(rust_log_env_filter()) + }); + let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + match tracing_error_layer_enablement { + TracingErrorLayerEnablement::EnableWithRustLogFilter => r + .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())) + .init(), + TracingErrorLayerEnablement::Disabled => r.init(), } Ok(()) @@ -157,3 +215,33 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> { ::fmt(self, f) } } + +#[cfg(test)] +mod tests { + use metrics::{core::Opts, IntCounterVec}; + + use super::TracingEventCountLayer; + + #[test] + fn tracing_event_count_metric() { + let counter_vec = + IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap(); + let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static + let layer = TracingEventCountLayer(counter_vec); + use tracing_subscriber::prelude::*; + + tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || { + tracing::trace!("foo"); + tracing::debug!("foo"); + tracing::info!("foo"); + tracing::warn!("foo"); + tracing::error!("foo"); + }); + + assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1); + } +} diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index acf5ea28d7..0493d43088 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -62,29 +62,48 @@ impl Lsn { } /// Compute the offset into a segment + #[inline] pub fn segment_offset(self, seg_sz: usize) -> usize { (self.0 % seg_sz as u64) as usize } /// Compute LSN of the segment start. + #[inline] pub fn segment_lsn(self, seg_sz: usize) -> Lsn { Lsn(self.0 - (self.0 % seg_sz as u64)) } /// Compute the segment number + #[inline] pub fn segment_number(self, seg_sz: usize) -> u64 { self.0 / seg_sz as u64 } /// Compute the offset into a block + #[inline] pub fn block_offset(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; self.0 % BLCKSZ } + /// Compute the block offset of the first byte of this Lsn within this + /// segment + #[inline] + pub fn page_lsn(self) -> Lsn { + Lsn(self.0 - self.block_offset()) + } + + /// Compute the block offset of the first byte of this Lsn within this + /// segment + #[inline] + pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 { + (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0 + } + /// Compute the bytes remaining in this block /// /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`. + #[inline] pub fn remaining_in_block(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; BLCKSZ - (self.0 % BLCKSZ) diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs new file mode 100644 index 0000000000..b9f7986442 --- /dev/null +++ b/libs/utils/src/tracing_span_assert.rs @@ -0,0 +1,287 @@ +//! Assert that the current [`tracing::Span`] has a given set of fields. +//! +//! # Usage +//! +//! ``` +//! use tracing_subscriber::prelude::*; +//! let registry = tracing_subscriber::registry() +//! .with(tracing_error::ErrorLayer::default()); +//! +//! // Register the registry as the global subscriber. +//! // In this example, we'll only use it as a thread-local subscriber. +//! let _guard = tracing::subscriber::set_default(registry); +//! +//! // Then, in the main code: +//! +//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let _guard = span.enter(); +//! +//! // ... down the call stack +//! +//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; +//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! match check_fields_present([&extractor]) { +//! Ok(()) => {}, +//! Err(missing) => { +//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::>()); +//! } +//! } +//! ``` +//! +//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering +//! + +use std::{ + collections::HashSet, + fmt::{self}, + hash::{Hash, Hasher}, +}; + +pub enum ExtractionResult { + Present, + Absent, +} + +pub trait Extractor: Send + Sync + std::fmt::Debug { + fn name(&self) -> &str; + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; +} + +#[derive(Debug)] +pub struct MultiNameExtractor { + name: &'static str, + field_names: [&'static str; L], +} + +impl MultiNameExtractor { + pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { + MultiNameExtractor { name, field_names } + } +} +impl Extractor for MultiNameExtractor { + fn name(&self) -> &str { + self.name + } + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { + if fields.iter().any(|f| self.field_names.contains(&f.name())) { + ExtractionResult::Present + } else { + ExtractionResult::Absent + } + } +} + +struct MemoryIdentity<'a>(&'a dyn Extractor); + +impl<'a> MemoryIdentity<'a> { + fn as_ptr(&self) -> *const () { + self.0 as *const _ as *const () + } +} +impl<'a> PartialEq for MemoryIdentity<'a> { + fn eq(&self, other: &Self) -> bool { + self.as_ptr() == other.as_ptr() + } +} +impl<'a> Eq for MemoryIdentity<'a> {} +impl<'a> Hash for MemoryIdentity<'a> { + fn hash(&self, state: &mut H) { + self.as_ptr().hash(state); + } +} +impl<'a> fmt::Debug for MemoryIdentity<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + } +} + +/// The extractor names passed as keys to [`new`]. +pub fn check_fields_present( + must_be_present: [&dyn Extractor; L], +) -> Result<(), Vec<&dyn Extractor>> { + let mut missing: HashSet = + HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r))); + let trace = tracing_error::SpanTrace::capture(); + trace.with_spans(|md, _formatted_fields| { + missing.retain(|extractor| match extractor.0.extract(md.fields()) { + ExtractionResult::Present => false, + ExtractionResult::Absent => true, + }); + !missing.is_empty() // continue walking up until we've found all missing + }); + if missing.is_empty() { + Ok(()) + } else { + Err(missing.into_iter().map(|mi| mi.0).collect()) + } +} + +#[cfg(test)] +mod tests { + + use tracing_subscriber::prelude::*; + + use super::*; + + struct Setup { + _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, + tenant_extractor: MultiNameExtractor<2>, + timeline_extractor: MultiNameExtractor<2>, + } + + fn setup_current_thread() -> Setup { + let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); + let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + + let registry = tracing_subscriber::registry() + .with(tracing_subscriber::fmt::layer()) + .with(tracing_error::ErrorLayer::default()); + + let guard = tracing::subscriber::set_default(registry); + + Setup { + _current_thread_subscriber_guard: guard, + tenant_extractor, + timeline_extractor, + } + } + + fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) { + let missing: HashSet = + HashSet::from_iter(missing.into_iter().map(MemoryIdentity)); + let expected: HashSet = + HashSet::from_iter(expected.into_iter().map(MemoryIdentity)); + assert_eq!(missing, expected); + } + + #[test] + fn positive_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_subset_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn positive_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn negative_subset_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn negative_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn tracing_error_subscriber_not_set_up() { + // no setup + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } + + #[test] + #[should_panic] + fn panics_if_tracing_error_subscriber_has_wrong_filter() { + let r = tracing_subscriber::registry().with({ + tracing_error::ErrorLayer::default().with_filter( + tracing_subscriber::filter::dynamic_filter_fn(|md, _| { + if md.is_span() && *md.level() == tracing::Level::INFO { + return false; + } + true + }), + ) + }); + + let _guard = tracing::subscriber::set_default(r); + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 41fa0a67bb..c666fc785c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -463,9 +463,13 @@ where let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; - let wal_seg = - postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) - .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; + let wal_seg = postgres_ffi::generate_wal_segment( + segno, + system_identifier, + self.timeline.pg_version, + self.lsn, + ) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..]).await?; Ok(()) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ed23a18ee0..d843b01ed7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -25,6 +25,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, @@ -86,8 +87,19 @@ fn main() -> anyhow::Result<()> { } }; - // Initialize logging, which must be initialized before the custom panic hook is installed. - logging::init(conf.log_format)?; + // Initialize logging. + // + // It must be initialized before the custom panic hook is installed below. + // + // Regarding tracing_error enablement: at this time, we only use the + // tracing_error crate to debug_assert that log spans contain tenant and timeline ids. + // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module + let tracing_error_layer_enablement = if cfg!(debug_assertions) { + TracingErrorLayerEnablement::EnableWithRustLogFilter + } else { + TracingErrorLayerEnablement::Disabled + }; + logging::init(conf.log_format, tracing_error_layer_enablement)?; // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. // disarming this hook on pageserver, because we never tear down tracing. @@ -226,6 +238,7 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION); set_launch_timestamp_metric(launch_ts); + pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b0e4e1ca85..95f6e96a5b 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -520,6 +520,43 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/synthetic_size: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: | + Calculate tenant's synthetic size + responses: + "200": + description: Tenant's synthetic size + content: + application/json: + schema: + $ref: "#/components/schemas/SyntheticSizeResponse" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -948,6 +985,84 @@ components: latest_gc_cutoff_lsn: type: string format: hex + + SyntheticSizeResponse: + type: object + required: + - id + - size + - segment_sizes + - inputs + properties: + id: + type: string + format: hex + size: + type: integer + segment_sizes: + type: array + items: + $ref: "#/components/schemas/SegmentSize" + inputs: + type: object + properties: + segments: + type: array + items: + $ref: "#/components/schemas/SegmentData" + timeline_inputs: + type: array + items: + $ref: "#/components/schemas/TimelineInput" + + SegmentSize: + type: object + required: + - method + - accum_size + properties: + method: + type: string + accum_size: + type: integer + + SegmentData: + type: object + required: + - segment + properties: + segment: + type: object + required: + - lsn + properties: + parent: + type: integer + lsn: + type: integer + size: + type: integer + needed: + type: boolean + timeline_id: + type: string + format: hex + kind: + type: string + + TimelineInput: + type: object + required: + - timeline_id + properties: + ancestor_id: + type: string + ancestor_lsn: + type: string + timeline_id: + type: string + format: hex + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index de1e7d49fb..13acaf525b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1202,6 +1202,37 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } +#[cfg(feature = "testing")] +async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { + #[derive(Debug, serde::Deserialize)] + #[serde(rename_all = "lowercase")] + enum Level { + Error, + Warn, + Info, + Debug, + Trace, + } + #[derive(Debug, serde::Deserialize)] + struct Request { + level: Level, + message: String, + } + let body: Request = json_request(&mut r) + .await + .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + + match body.level { + Level::Error => tracing::error!(?body.message), + Level::Warn => tracing::warn!(?body.message), + Level::Info => tracing::info!(?body.message), + Level::Debug => tracing::debug!(?body.message), + Level::Trace => tracing::trace!(?body.message), + } + + json_response(StatusCode::OK, ()) +} + pub fn make_router( conf: &'static PageServerConf, launch_ts: &'static LaunchTimestamp, @@ -1342,5 +1373,9 @@ pub fn make_router( testing_api!("set tenant state to broken", handle_tenant_break), ) .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) + .post( + "/v1/tracing/event", + testing_api!("emit a tracing event", post_tracing_event_handler), + ) .any(handler_404)) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 39e434a023..936de35eb9 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -114,7 +114,7 @@ async fn import_rel( path: &Path, spcoid: Oid, dboid: Oid, - reader: &mut (impl AsyncRead + Send + Sync + Unpin), + reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -200,7 +200,7 @@ async fn import_slru( modification: &mut DatadirModification<'_>, slru: SlruKind, path: &Path, - reader: &mut (impl AsyncRead + Send + Sync + Unpin), + reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -612,8 +612,8 @@ async fn import_file( Ok(None) } -async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result { +async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result { let mut buf: Vec = vec![]; reader.read_to_end(&mut buf).await?; - Ok(Bytes::copy_from_slice(&buf[..])) + Ok(Bytes::from(buf)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 278658eba3..04863886cb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -44,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); +pub use crate::metrics::preinitialize_metrics; + pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c075315683..deb20f21f8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,9 +1,9 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, - register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, - Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, - UIntGauge, UIntGaugeVec, + register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, Counter, CounterVec, + Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, + UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::models::TenantState; @@ -205,6 +205,15 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| .expect("failed to define a metric") }); +pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_unexpected_ondemand_downloads_count", + "Number of unexpected on-demand downloads. \ + We log more context for each increment, so, forgo any labels in this metric.", + ) + .expect("failed to define a metric") +}); + /// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] pub struct EvictionsWithLowResidenceDuration { @@ -350,11 +359,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { - register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") - .expect("failed to define a metric") -}); - // remote storage metrics /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. @@ -385,6 +389,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new .expect("failed to define a metric") }); +static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_timeline_client_bytes_started", + "Incremented by the number of bytes associated with a remote timeline client operation. \ + The increment happens when the operation is scheduled.", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + ) + .expect("failed to define a metric") +}); + +static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_timeline_client_bytes_finished", + "Incremented by the number of bytes associated with a remote timeline client operation. \ + The increment happens when the operation finishes (regardless of success/failure/shutdown).", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -739,6 +763,8 @@ pub struct RemoteTimelineClientMetrics { remote_operation_time: Mutex>, calls_unfinished_gauge: Mutex>, calls_started_hist: Mutex>, + bytes_started_counter: Mutex>, + bytes_finished_counter: Mutex>, } impl RemoteTimelineClientMetrics { @@ -749,6 +775,8 @@ impl RemoteTimelineClientMetrics { remote_operation_time: Mutex::new(HashMap::default()), calls_unfinished_gauge: Mutex::new(HashMap::default()), calls_started_hist: Mutex::new(HashMap::default()), + bytes_started_counter: Mutex::new(HashMap::default()), + bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge: Mutex::new(None), } } @@ -787,6 +815,7 @@ impl RemoteTimelineClientMetrics { }); metric.clone() } + fn calls_unfinished_gauge( &self, file_kind: &RemoteOpFileKind, @@ -828,32 +857,125 @@ impl RemoteTimelineClientMetrics { }); metric.clone() } + + fn bytes_started_counter( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntCounter { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.bytes_started_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } + + fn bytes_finished_counter( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntCounter { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.bytes_finished_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } +} + +#[cfg(test)] +impl RemoteTimelineClientMetrics { + pub fn get_bytes_started_counter_value( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Option { + let guard = self.bytes_started_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + guard.get(&key).map(|counter| counter.get()) + } + + pub fn get_bytes_finished_counter_value( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Option { + let guard = self.bytes_finished_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + guard.get(&key).map(|counter| counter.get()) + } } /// See [`RemoteTimelineClientMetrics::call_begin`]. #[must_use] -pub(crate) struct RemoteTimelineClientCallMetricGuard(Option); +pub(crate) struct RemoteTimelineClientCallMetricGuard { + /// Decremented on drop. + calls_unfinished_metric: Option, + /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. + bytes_finished: Option<(IntCounter, u64)>, +} impl RemoteTimelineClientCallMetricGuard { - /// Consume this guard object without decrementing the metric. - /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out. + /// Consume this guard object without performing the metric updates it would do on `drop()`. + /// The caller vouches to do the metric updates manually. pub fn will_decrement_manually(mut self) { - self.0 = None; // prevent drop() from decrementing + let RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric, + bytes_finished, + } = &mut self; + calls_unfinished_metric.take(); + bytes_finished.take(); } } impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { - if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self { + let RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric, + bytes_finished, + } = self; + if let Some(guard) = calls_unfinished_metric.take() { guard.dec(); } + if let Some((bytes_finished_metric, value)) = bytes_finished { + bytes_finished_metric.inc_by(*value); + } } } +/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to +/// track the byte size of this call in applicable metric(s). +pub(crate) enum RemoteTimelineClientMetricsCallTrackSize { + /// Do not account for this call's byte size in any metrics. + /// The `reason` field is there to make the call sites self-documenting + /// about why they don't need the metric. + DontTrackSize { reason: &'static str }, + /// Track the byte size of the call in applicable metric(s). + Bytes(u64), +} + impl RemoteTimelineClientMetrics { - /// Increment the metrics that track ongoing calls to the remote timeline client instance. + /// Update the metrics that change when a call to the remote timeline client instance starts. /// - /// Drop the returned guard object once the operation is finished to decrement the values. + /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions. /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that /// is more suitable. /// Never do both. @@ -861,24 +983,51 @@ impl RemoteTimelineClientMetrics { &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, + size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { - let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); self.calls_started_hist(file_kind, op_kind) - .observe(unfinished_metric.get() as f64); - unfinished_metric.inc(); - RemoteTimelineClientCallMetricGuard(Some(unfinished_metric)) + .observe(calls_unfinished_metric.get() as f64); + calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric + + let bytes_finished = match size { + RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { + // nothing to do + None + } + RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { + self.bytes_started_counter(file_kind, op_kind).inc_by(size); + let finished_counter = self.bytes_finished_counter(file_kind, op_kind); + Some((finished_counter, size)) + } + }; + RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric: Some(calls_unfinished_metric), + bytes_finished, + } } - /// Manually decrement the metric instead of using the guard object. + /// Manually udpate the metrics that track completions, instead of using the guard object. /// Using the guard object is generally preferable. /// See [`call_begin`] for more context. - pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) { - let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + pub(crate) fn call_end( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + size: RemoteTimelineClientMetricsCallTrackSize, + ) { + let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); debug_assert!( - unfinished_metric.get() > 0, + calls_unfinished_metric.get() > 0, "begin and end should cancel out" ); - unfinished_metric.dec(); + calls_unfinished_metric.dec(); + match size { + RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} + RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { + self.bytes_finished_counter(file_kind, op_kind).inc_by(size); + } + } } } @@ -891,6 +1040,8 @@ impl Drop for RemoteTimelineClientMetrics { remote_operation_time, calls_unfinished_gauge, calls_started_hist, + bytes_started_counter, + bytes_finished_counter, } = self; for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); @@ -911,6 +1062,22 @@ impl Drop for RemoteTimelineClientMetrics { b, ]); } + for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } { let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); @@ -974,3 +1141,10 @@ impl>, O, E> Future for MeasuredRemoteOp { poll_result } } + +pub fn preinitialize_metrics() { + // We want to alert on this metric increasing. + // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. + assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); + UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bd38a7a2f3..3610704f2c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -20,7 +20,6 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; -use postgres_backend::PostgresBackendTCP; use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; @@ -32,6 +31,7 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; use tracing::*; use utils::id::ConnectionId; @@ -57,7 +57,10 @@ use crate::trace::Tracer; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream> + '_ { +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ +where + IO: AsyncRead + AsyncWrite + Unpin, +{ async_stream::try_stream! { loop { let msg = tokio::select! { @@ -65,8 +68,8 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream { // We were requested to shut down. - let msg = "pageserver is shutting down".to_string(); - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)); + let msg = "pageserver is shutting down"; + let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); Err(QueryError::Other(anyhow::anyhow!(msg))) } @@ -125,7 +128,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream anyhow::Result<()> { +async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { use tokio::io::AsyncReadExt; let mut buf = [0u8; 512]; @@ -245,12 +248,14 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let peer_addr = socket.peer_addr().context("get peer address")?; + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); - let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) @@ -332,13 +337,16 @@ impl PageServerHandler { } #[instrument(skip(self, pgb, ctx))] - async fn handle_pagerequests( + async fn handle_pagerequests( &self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, ctx: RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result<()> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); @@ -436,16 +444,19 @@ impl PageServerHandler { #[allow(clippy::too_many_arguments)] #[instrument(skip(self, pgb, ctx))] - async fn handle_import_basebackup( + async fn handle_import_basebackup( &self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, ctx: RequestContext, - ) -> Result<(), QueryError> { + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); @@ -486,15 +497,18 @@ impl PageServerHandler { } #[instrument(skip(self, pgb, ctx))] - async fn handle_import_wal( + async fn handle_import_wal( &self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, ctx: RequestContext, - ) -> Result<(), QueryError> { + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; @@ -690,16 +704,21 @@ impl PageServerHandler { #[allow(clippy::too_many_arguments)] #[instrument(skip(self, pgb, ctx))] - async fn handle_basebackup_request( + async fn handle_basebackup_request( &mut self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, ctx: RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result<()> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let started = std::time::Instant::now(); + // check that the timeline exists let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); @@ -712,6 +731,8 @@ impl PageServerHandler { .context("invalid basebackup lsn")?; } + let lsn_awaited_after = started.elapsed(); + // switch client to COPYOUT pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; pgb.flush().await?; @@ -732,7 +753,17 @@ impl PageServerHandler { pgb.write_message_noflush(&BeMessage::CopyDone)?; pgb.flush().await?; - info!("basebackup complete"); + + let basebackup_after = started + .elapsed() + .checked_sub(lsn_awaited_after) + .unwrap_or(Duration::ZERO); + + info!( + lsn_await_millis = lsn_awaited_after.as_millis(), + basebackup_millis = basebackup_after.as_millis(), + "basebackup complete" + ); Ok(()) } @@ -756,10 +787,13 @@ impl PageServerHandler { } #[async_trait::async_trait] -impl postgres_backend::Handler for PageServerHandler { +impl postgres_backend::Handler for PageServerHandler +where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, +{ fn check_auth_jwt( &mut self, - _pgb: &mut PostgresBackendTCP, + _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT @@ -787,7 +821,7 @@ impl postgres_backend::Handler for PageServerHandler { fn startup( &mut self, - _pgb: &mut PostgresBackendTCP, + _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { Ok(()) @@ -795,7 +829,7 @@ impl postgres_backend::Handler for PageServerHandler { async fn process_query( &mut self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { let ctx = self.connection_ctx.attached_child(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c437fff89b..e60567f5fa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -119,6 +119,10 @@ pub struct Tenant { // Global pageserver config parameters pub conf: &'static PageServerConf, + /// The value creation timestamp, used to measure activation delay, see: + /// + loading_started_at: Instant, + state: watch::Sender, // Overridden tenant-specific config parameters. @@ -1516,7 +1520,7 @@ impl Tenant { TenantState::Loading | TenantState::Attaching => { *current_state = TenantState::Active; - info!("Activating tenant {}", self.tenant_id); + debug!(tenant_id = %self.tenant_id, "Activating tenant"); let timelines_accessor = self.timelines.lock().unwrap(); let not_broken_timelines = timelines_accessor @@ -1527,12 +1531,17 @@ impl Tenant { // down when they notice that the tenant is inactive. tasks::start_background_loops(self.tenant_id); + let mut activated_timelines = 0; + let mut timelines_broken_during_activation = 0; + for timeline in not_broken_timelines { match timeline .activate(ctx) .context("timeline activation for activating tenant") { - Ok(()) => {} + Ok(()) => { + activated_timelines += 1; + } Err(e) => { error!( "Failed to activate timeline {}: {:#}", @@ -1543,9 +1552,26 @@ impl Tenant { "failed to activate timeline {}: {}", timeline.timeline_id, e )); + + timelines_broken_during_activation += 1; } } } + + let elapsed = self.loading_started_at.elapsed(); + let total_timelines = timelines_accessor.len(); + + // log a lot of stuff, because some tenants sometimes suffer from user-visible + // times to activate. see https://github.com/neondatabase/neon/issues/4025 + info!( + since_creation_millis = elapsed.as_millis(), + tenant_id = %self.tenant_id, + activated_timelines, + timelines_broken_during_activation, + total_timelines, + post_state = <&'static str>::from(&*current_state), + "activation attempt finished" + ); } } }); @@ -1852,6 +1878,9 @@ impl Tenant { Tenant { tenant_id, conf, + // using now here is good enough approximation to catch tenants with really long + // activation times. + loading_started_at: Instant::now(), tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), gc_cs: tokio::sync::Mutex::new(()), @@ -2897,7 +2926,13 @@ pub mod harness { }; LOG_HANDLE.get_or_init(|| { - logging::init(logging::LogFormat::Test).expect("Failed to init test logging") + logging::init( + logging::LogFormat::Test, + // enable it in case in case the tests exercise code paths that use + // debug_assert_current_span_has_tenant_and_timeline_id + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + ) + .expect("Failed to init test logging") }); let repo_dir = PageServerConf::test_repo_dir(test_name); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 02159ee291..0ee0c6f77d 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -48,7 +48,6 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; -use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; @@ -288,7 +287,6 @@ where self.l0_delta_layers.push(layer); } - NUM_ONDISK_LAYERS.inc(); Ok(()) } @@ -314,8 +312,6 @@ where "failed to locate removed historic layer from l0_delta_layers" ); } - - NUM_ONDISK_LAYERS.dec(); } pub(self) fn replace_historic_noflush( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 525725eb9e..a9218f7bbf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -222,7 +222,8 @@ use utils::lsn::Lsn; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, - REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, + REMOTE_ONDEMAND_DOWNLOADED_LAYERS, }; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::{ @@ -383,9 +384,13 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file(&self) -> Result { - let _unfinished_gauge_guard = self - .metrics - .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download); + let _unfinished_gauge_guard = self.metrics.call_begin( + &RemoteOpFileKind::Index, + &RemoteOpKind::Download, + crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { + reason: "no need for a downloads gauge", + }, + ); let index_part = download::download_index_part( self.conf, @@ -420,9 +425,13 @@ impl RemoteTimelineClient { layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { let downloaded_size = { - let _unfinished_gauge_guard = self - .metrics - .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download); + let _unfinished_gauge_guard = self.metrics.call_begin( + &RemoteOpFileKind::Layer, + &RemoteOpKind::Download, + crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { + reason: "no need for a downloads gauge", + }, + ); download::download_layer_file( self.conf, &self.storage_impl, @@ -990,11 +999,32 @@ impl RemoteTimelineClient { fn calls_unfinished_metric_impl( &self, op: &UploadOp, - ) -> Option<(RemoteOpFileKind, RemoteOpKind)> { + ) -> Option<( + RemoteOpFileKind, + RemoteOpKind, + RemoteTimelineClientMetricsCallTrackSize, + )> { + use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize; let res = match op { - UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload), - UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload), - UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete), + UploadOp::UploadLayer(_, m) => ( + RemoteOpFileKind::Layer, + RemoteOpKind::Upload, + RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()), + ), + UploadOp::UploadMetadata(_, _) => ( + RemoteOpFileKind::Index, + RemoteOpKind::Upload, + DontTrackSize { + reason: "metadata uploads are tiny", + }, + ), + UploadOp::Delete(file_kind, _) => ( + *file_kind, + RemoteOpKind::Delete, + DontTrackSize { + reason: "should we track deletes? positive or negative sign?", + }, + ), UploadOp::Barrier(_) => { // we do not account these return None; @@ -1004,20 +1034,20 @@ impl RemoteTimelineClient { } fn calls_unfinished_metric_begin(&self, op: &UploadOp) { - let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { Some(x) => x, None => return, }; - let guard = self.metrics.call_begin(&file_kind, &op_kind); + let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes); guard.will_decrement_manually(); // in unfinished_ops_metric_end() } fn calls_unfinished_metric_end(&self, op: &UploadOp) { - let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { Some(x) => x, None => return, }; - self.metrics.call_end(&file_kind, &op_kind); + self.metrics.call_end(&file_kind, &op_kind, track_bytes); } /// Close the upload queue for new operations and cancel queued operations. @@ -1095,11 +1125,19 @@ impl RemoteTimelineClient { mod tests { use super::*; use crate::{ - tenant::harness::{TenantHarness, TIMELINE_ID}, + context::RequestContext, + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + Tenant, + }, DEFAULT_PG_VERSION, }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; - use std::{collections::HashSet, path::Path}; + use std::{ + collections::HashSet, + path::{Path, PathBuf}, + }; + use tokio::runtime::EnterGuard; use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { @@ -1148,39 +1186,80 @@ mod tests { assert_eq!(found, expected); } + struct TestSetup { + runtime: &'static tokio::runtime::Runtime, + entered_runtime: EnterGuard<'static>, + harness: TenantHarness<'static>, + tenant: Arc, + tenant_ctx: RequestContext, + remote_fs_dir: PathBuf, + client: Arc, + } + + impl TestSetup { + fn new(test_name: &str) -> anyhow::Result { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let entered_runtime = runtime.enter(); + + let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); + let harness = TenantHarness::create(test_name)?; + let (tenant, ctx) = runtime.block_on(harness.load()); + // create an empty timeline directory + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx).unwrap(); + + let remote_fs_dir = harness.conf.workdir.join("remote_fs"); + std::fs::create_dir_all(remote_fs_dir)?; + let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; + + let storage_config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + ) + .unwrap(), + max_sync_errors: std::num::NonZeroU32::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + ) + .unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + + let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + + let client = Arc::new(RemoteTimelineClient { + conf: harness.conf, + runtime, + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + storage_impl: storage, + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &harness.tenant_id, + &TIMELINE_ID, + )), + }); + + Ok(Self { + runtime, + entered_runtime, + harness, + tenant, + tenant_ctx: ctx, + remote_fs_dir, + client, + }) + } + } + // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - - let harness = TenantHarness::create("upload_scheduling")?; - let (tenant, ctx) = runtime.block_on(harness.load()); - let _timeline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let timeline_path = harness.timeline_path(&TIMELINE_ID); - - let remote_fs_dir = harness.conf.workdir.join("remote_fs"); - std::fs::create_dir_all(remote_fs_dir)?; - let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; - - let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), - max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), - }; - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued @@ -1195,21 +1274,19 @@ mod tests { // Schedule another deletion. Check that it's launched immediately. // Schedule index upload. Check that it's queued - println!("workdir: {}", harness.conf.workdir.display()); - - let storage_impl = GenericRemoteStorage::from_config(&storage_config)?; - let client = Arc::new(RemoteTimelineClient { - conf: harness.conf, + let TestSetup { runtime, - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - storage_impl, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new( - &harness.tenant_id, - &TIMELINE_ID, - )), - }); + entered_runtime: _entered_runtime, + harness, + tenant: _tenant, + tenant_ctx: _tenant_ctx, + remote_fs_dir, + client, + } = TestSetup::new("upload_scheduling").unwrap(); + + let timeline_path = harness.timeline_path(&TIMELINE_ID); + + println!("workdir: {}", harness.conf.workdir.display()); let remote_timeline_dir = remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?); @@ -1334,4 +1411,90 @@ mod tests { Ok(()) } + + #[test] + fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> { + // Setup + + let TestSetup { + runtime, + harness, + client, + .. + } = TestSetup::new("metrics")?; + + let metadata = dummy_metadata(Lsn(0x10)); + client.init_upload_queue_for_empty_remote(&metadata)?; + + let timeline_path = harness.timeline_path(&TIMELINE_ID); + + let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let content_1 = dummy_contents("foo"); + std::fs::write( + timeline_path.join(layer_file_name_1.file_name()), + &content_1, + )?; + + #[derive(Debug, PartialEq)] + struct BytesStartedFinished { + started: Option, + finished: Option, + } + let get_bytes_started_stopped = || { + let started = client + .metrics + .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload) + .map(|v| v.try_into().unwrap()); + let stopped = client + .metrics + .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload) + .map(|v| v.try_into().unwrap()); + BytesStartedFinished { + started, + finished: stopped, + } + }; + + // Test + + let init = get_bytes_started_stopped(); + + client.schedule_layer_file_upload( + &layer_file_name_1, + &LayerFileMetadata::new(content_1.len() as u64), + )?; + + let pre = get_bytes_started_stopped(); + + runtime.block_on(client.wait_completion())?; + + let post = get_bytes_started_stopped(); + + // Validate + + assert_eq!( + init, + BytesStartedFinished { + started: None, + finished: None + } + ); + assert_eq!( + pre, + BytesStartedFinished { + started: Some(content_1.len()), + // assert that the _finished metric is created eagerly so that subtractions work on first sample + finished: Some(0), + } + ); + assert_eq!( + post, + BytesStartedFinished { + started: Some(content_1.len()), + finished: Some(content_1.len()) + } + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index bda095d850..a0d8c0193a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -16,6 +16,7 @@ use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; @@ -43,6 +44,8 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); let local_path = timeline_path.join(layer_file_name.file_name()); @@ -154,7 +157,7 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("Could not fsync layer file {}", local_path.display(),)) .map_err(DownloadError::Other)?; - tracing::info!("download complete: {}", local_path.display()); + tracing::debug!("download complete: {}", local_path.display()); Ok(bytes_amount) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b8b1f963e5..5c671ffd63 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -48,7 +48,7 @@ use crate::tenant::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; @@ -936,6 +936,7 @@ impl Timeline { } } + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) }; @@ -2355,6 +2356,7 @@ impl Timeline { id, ctx.task_kind() ); + UNEXPECTED_ONDEMAND_DOWNLOADS.inc(); timeline.download_remote_layer(remote_layer).await?; continue 'layer_map_search; } @@ -3818,11 +3820,13 @@ impl Timeline { /// If the caller has a deadline or needs a timeout, they can simply stop polling: /// we're **cancellation-safe** because the download happens in a separate task_mgr task. /// So, the current download attempt will run to completion even if we stop polling. - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + #[instrument(skip_all, fields(layer=%remote_layer.short_id()))] pub async fn download_remote_layer( &self, remote_layer: Arc, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); + use std::sync::atomic::Ordering::Relaxed; let permit = match Arc::clone(&remote_layer.ongoing_download) @@ -3866,6 +3870,8 @@ impl Timeline { .await; if let Ok(size) = &result { + info!("layer file download finished"); + // XXX the temp file is still around in Err() case // and consumes space until we clean up upon pageserver restart. self_clone.metrics.resident_physical_size_gauge.add(*size); @@ -3937,6 +3943,8 @@ impl Timeline { updates.flush(); drop(layers); + info!("on-demand download successful"); + // Now that we've inserted the download into the layer map, // close the semaphore. This will make other waiters for // this download return Ok(()). @@ -3944,7 +3952,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. - error!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); + error!("layer file download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download @@ -4255,3 +4263,36 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } + +#[cfg(not(debug_assertions))] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} + +#[cfg(debug_assertions)] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + use utils::tracing_span_assert; + + pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< + tracing_span_assert::MultiNameExtractor<2>, + > = once_cell::sync::Lazy::new(|| { + tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]) + }); + + pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy< + tracing_span_assert::MultiNameExtractor<2>, + > = once_cell::sync::Lazy::new(|| { + tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]) + }); + + match tracing_span_assert::check_fields_present([ + &*TENANT_ID_EXTRACTOR, + &*TIMELINE_ID_EXTRACTOR, + ]) { + Ok(()) => (), + Err(missing) => panic!( + "missing extractors: {:?}", + missing.into_iter().map(|e| e.name()).collect::>() + ), + } +} diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index efcbfbce3d..731c5c4644 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -348,7 +348,7 @@ impl ConnectionManagerState { .context("walreceiver connection handling failure") } .instrument( - info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id), + info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id), ) }); diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 8dff259f02..cc46fb5a25 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -370,6 +370,74 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Evict a page (if present) from the local file cache + */ +void +lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +{ + BufferTag tag; + FileCacheEntry* entry; + ssize_t rc; + bool found; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return; + + INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1))); + + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found); + + if (!found) + { + /* nothing to do */ + LWLockRelease(lfc_lock); + return; + } + + /* remove the page from the cache */ + entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); + + /* + * If the chunk has no live entries, we can position the chunk to be + * recycled first. + */ + if (entry->bitmap[chunk_offs >> 5] == 0) + { + bool has_remaining_pages; + + for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when + * we have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->lru_node); + dlist_push_head(&lfc_ctl->lru, &entry->lru_node); + } + } + + /* + * Done: apart from empty chunks, we don't move chunks in the LRU when + * they're empty because eviction isn't usage. + */ + + LWLockRelease(lfc_lock); +} + /* * Try to read page from local cache. * Returns true if page is found in local cache. @@ -528,7 +596,6 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); } - /* * Record structure holding the to be exposed cache data. */ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index c44e8fcda5..21330c018f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -17,6 +17,8 @@ #include "pagestore_client.h" #include "fmgr.h" #include "access/xlog.h" +#include "access/xlogutils.h" +#include "storage/buf_internals.h" #include "libpq-fe.h" #include "libpq/pqformat.h" @@ -57,6 +59,8 @@ int n_unflushed_requests = 0; int flush_every_n_requests = 8; int readahead_buffer_size = 128; +bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; + static void pageserver_flush(void); static bool @@ -467,6 +471,8 @@ pg_init_libpagestore(void) smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; + old_redo_read_buffer_filter = redo_read_buffer_filter; + redo_read_buffer_filter = neon_redo_read_buffer_filter; } lfc_init(); } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 5c98902554..217c1974a0 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -24,6 +24,7 @@ #include "neon.h" #include "walproposer.h" +#include "pagestore_client.h" PG_MODULE_MAGIC; void _PG_init(void); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 3eac8f4570..60d321a945 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -11,6 +11,7 @@ #ifndef NEON_H #define NEON_H +#include "access/xlogreader.h" /* GUCs */ extern char *neon_auth_token; @@ -20,4 +21,11 @@ extern char *neon_tenant; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. + */ +extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + #endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index a1f05ac685..22f5cdb73a 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -207,6 +207,7 @@ extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 5b30641856..528d4eb051 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -189,6 +189,7 @@ typedef struct PrfHashEntry { #define SH_DEFINE #define SH_DECLARE #include "lib/simplehash.h" +#include "neon.h" /* * PrefetchState maintains the state of (prefetch) getPage@LSN requests. @@ -1209,6 +1210,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch if (ShutdownRequestPending) return; + /* Don't log any pages if we're not allowed to do so. */ + if (!XLogInsertAllowed()) + return; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1375,8 +1379,18 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN if (RecoveryInProgress()) { + /* + * We don't know if WAL has been generated but not yet replayed, so + * we're conservative in our estimates about latest pages. + */ *latest = false; - lsn = GetXLogReplayRecPtr(NULL); + + /* + * Get the last written LSN of this page. + */ + lsn = GetLastWrittenLSN(rnode, forknum, blkno); + lsn = nm_adjust_lsn(lsn); + elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); } @@ -1559,6 +1573,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) /* * Newly created relation is empty, remember that in the relsize cache. * + * Note that in REDO, this is called to make sure the relation fork exists, + * but it does not truncate the relation. So, we can only update the + * relsize if it didn't exist before. + * + * Also, in redo, we must make sure to update the cached size of the + * relation, as that is the primary source of truth for REDO's + * file length considerations, and as file extension isn't (perfectly) + * logged, we need to take care of that before we hit file size checks. + * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created * relation. Currently, we don't call SetLastWrittenLSN() when a new @@ -1566,7 +1589,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. */ - set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + if (isRedo) + { + update_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + get_cached_relsize(reln->smgr_rnode.node, forkNum, + &reln->smgr_cached_nblocks[forkNum]); + } + else + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1831,6 +1861,26 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, .blockNum = blkno, }; + /* + * The redo process does not lock pages that it needs to replay but are + * not in the shared buffers, so a concurrent process may request the + * page after redo has decided it won't redo that page and updated the + * LwLSN for that page. + * If we're in hot standby we need to take care that we don't return + * until after REDO has finished replaying up to that LwLSN, as the page + * should have been locked up to that point. + * + * See also the description on neon_redo_read_buffer_filter below. + * + * NOTE: It is possible that the WAL redo process will still do IO due to + * concurrent failed read IOs. Those IOs should never have a request_lsn + * that is as large as the WAL record we're currently replaying, if it + * weren't for the behaviour of the LwLsn cache that uses the highest + * value of the LwLsn cache when the entry is not found. + */ + if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) + XLogWaitForReplayOf(request_lsn); + /* * Try to find prefetched page in the list of received pages. */ @@ -2584,3 +2634,143 @@ smgr_init_neon(void) smgr_init_standard(); neon_init(); } + + +/* + * Return whether we can skip the redo for this block. + * + * The conditions for skipping the IO are: + * + * - The block is not in the shared buffers, and + * - The block is not in the local file cache + * + * ... because any subsequent read of the page requires us to read + * the new version of the page from the PageServer. We do not + * check the local file cache; we instead evict the page from LFC: it + * is cheaper than going through the FS calls to read the page, and + * limits the number of lock operations used in the REDO process. + * + * We have one exception to the rules for skipping IO: We always apply + * changes to shared catalogs' pages. Although this is mostly out of caution, + * catalog updates usually result in backends rebuilding their catalog snapshot, + * which means it's quite likely the modified page is going to be used soon. + * + * It is important to note that skipping WAL redo for a page also means + * the page isn't locked by the redo process, as there is no Buffer + * being returned, nor is there a buffer descriptor to lock. + * This means that any IO that wants to read this block needs to wait + * for the WAL REDO process to finish processing the WAL record before + * it allows the system to start reading the block, as releasing the + * block early could lead to phantom reads. + * + * For example, REDO for a WAL record that modifies 3 blocks could skip + * the first block, wait for a lock on the second, and then modify the + * third block. Without skipping, all blocks would be locked and phantom + * reads would not occur, but with skipping, a concurrent process could + * read block 1 with post-REDO contents and read block 3 with pre-REDO + * contents, where with REDO locking it would wait on block 1 and see + * block 3 with post-REDO contents only. + */ +bool +neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr end_recptr = record->EndRecPtr; + XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + BufferTag tag; + uint32 hash; + LWLock *partitionLock; + Buffer buffer; + bool no_redo_needed; + BlockNumber relsize; + + if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) + return true; + +#if PG_VERSION_NUM < 150000 + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + elog(PANIC, "failed to locate backup block with ID %d", block_id); +#else + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno); +#endif + + /* + * Out of an abundance of caution, we always run redo on shared catalogs, + * regardless of whether the block is stored in shared buffers. + * See also this function's top comment. + */ + if (!OidIsValid(rnode.dbNode)) + return false; + + INIT_BUFFERTAG(tag, rnode, forknum, blkno); + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + /* + * Lock the partition of shared_buffers so that it can't be updated + * concurrently. + */ + LWLockAcquire(partitionLock, LW_SHARED); + + /* Try to find the relevant buffer */ + buffer = BufTableLookup(&tag, hash); + + no_redo_needed = buffer < 0; + + /* we don't have the buffer in memory, update lwLsn past this record */ + if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); + lfc_evict(rnode, forknum, blkno); + } + else + { + SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno); + } + + LWLockRelease(partitionLock); + + /* Extend the relation if we know its size */ + if (get_cached_relsize(rnode, forknum, &relsize)) + { + if (relsize < blkno + 1) + update_cached_relsize(rnode, forknum, blkno + 1); + } + else + { + /* + * Size was not cached. We populate the cache now, with the size of the + * relation measured after this WAL record is applied. + * + * This length is later reused when we open the smgr to read the block, + * which is fine and expected. + */ + + NeonResponse *response; + NeonNblocksResponse *nbresponse; + NeonNblocksRequest request = { + .req = (NeonRequest) { + .lsn = end_recptr, + .latest = false, + .tag = T_NeonNblocksRequest, + }, + .rnode = rnode, + .forknum = forknum, + }; + + response = page_server_request(&request); + + Assert(response->tag == T_NeonNblocksResponse); + nbresponse = (NeonNblocksResponse *) response; + + Assert(nbresponse->n_blocks > blkno); + + set_cached_relsize(rnode, forknum, nbresponse->n_blocks); + + elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); + } + + return no_redo_needed; +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 45037a8c01..a99be40955 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1964,18 +1964,26 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) { if (safekeeper[i].appendResponse.hs.ts != 0) { - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs; + if (FullTransactionIdIsNormal(skhs->xmin) + && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) { - hs->xmin = safekeeper[i].appendResponse.hs.xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; + hs->xmin = skhs->xmin; + hs->ts = skhs->ts; } - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + if (FullTransactionIdIsNormal(skhs->catalog_xmin) + && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) { - hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; + hs->catalog_xmin = skhs->catalog_xmin; + hs->ts = skhs->ts; } } } + + if (hs->xmin.value == ~0) + hs->xmin = InvalidFullTransactionId; + if (hs->catalog_xmin.value == ~0) + hs->catalog_xmin = InvalidFullTransactionId; } /* diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b5efc72803..0465703ae6 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,8 +1,8 @@ use crate::{cancellation::CancelClosure, error::UserFacingError}; -use futures::TryFutureExt; +use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{io, net::SocketAddr}; +use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; @@ -130,9 +130,23 @@ impl ConnCfg { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { use tokio_postgres::config::Host; + // wrap TcpStream::connect with timeout + let connect_with_timeout = |host, port| { + let connection_timeout = Duration::from_millis(10000); + tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map( + move |res| match res { + Ok(tcpstream_connect_res) => tcpstream_connect_res, + Err(_) => Err(io::Error::new( + io::ErrorKind::TimedOut, + format!("exceeded connection timeout {connection_timeout:?}"), + )), + }, + ) + }; + let connect_once = |host, port| { info!("trying to connect to compute node at {host}:{port}"); - TcpStream::connect((host, port)).and_then(|socket| async { + connect_with_timeout(host, port).and_then(|socket| async { let socket_addr = socket.peer_addr()?; // This prevents load balancer from severing the connection. socket2::SockRef::from(&socket).set_keepalive(true)?; @@ -165,7 +179,6 @@ impl ConnCfg { Host::Unix(_) => continue, // unix sockets are not welcome here }; - // TODO: maybe we should add a timeout. match connect_once(host, *port).await { Ok(socket) => return Ok(socket), Err(err) => { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 445c2e930c..6ae1e3a447 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -95,7 +95,7 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { current_metrics.push(( Ids { endpoint_id: endpoint_id.to_string(), - branch_id: "".to_string(), + branch_id: branch_id.to_string(), }, (value, Utc::now()), )); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 9945e3697f..1169d76160 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -95,9 +95,9 @@ pub async fn task_main( handle_client(config, &cancel_map, session_id, socket).await } - .unwrap_or_else(|e| { + .unwrap_or_else(move |e| { // Acknowledge that the task has finished with an error. - error!("per-client task finished with an error: {e:#}"); + error!(?session_id, "per-client task finished with an error: {e:#}"); }), ); } diff --git a/run_clippy.sh b/run_clippy.sh index 9adfddedc2..ae2a17ec0c 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -1,4 +1,5 @@ -#!/bin/bash +#!/usr/bin/env bash +set -euo pipefail # If you save this in your path under the name "cargo-zclippy" (or whatever # name you like), then you can run it as "cargo zclippy" from the shell prompt. @@ -8,7 +9,11 @@ # warnings and errors right in the editor. # In vscode, this setting is Rust-analyzer>Check On Save:Command -# * `-A unknown_lints` – do not warn about unknown lint suppressions -# that people with newer toolchains might use -# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings +# NB: the CI runs the full feature powerset, so, it catches slightly more errors +# at the expense of longer runtime. This script is used by developers, so, don't +# do that here. + +thisscript="${BASH_SOURCE[0]}" +thisscript_dir="$(dirname "$thisscript")" +CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" +exec cargo clippy --all-features $CLIPPY_COMMON_ARGS diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index ace921a26d..3699a2a74c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -134,7 +134,10 @@ fn main() -> anyhow::Result<()> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::init( + LogFormat::from_config(&args.log_format)?, + logging::TracingErrorLayerEnablement::Disabled, + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); info!("version: {GIT_VERSION}"); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index a589fe1869..2c3d1cea0e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -3,6 +3,7 @@ use anyhow::Context; use std::str; +use std::str::FromStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span, Instrument}; @@ -49,12 +50,14 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { - let re = - Regex::new(r"START_REPLICATION(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)").unwrap(); + let re = Regex::new( + r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)", + ) + .unwrap(); let mut caps = re.captures_iter(cmd); let start_lsn = caps .next() - .map(|cap| cap[1].parse::()) + .map(|cap| Lsn::from_str(&cap[1])) .context("parse start LSN from START_REPLICATION command")??; Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 9b385630c2..54e27714ea 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -18,6 +18,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::{max, min}; +use bytes::Bytes; use std::fs::{self, remove_file, File, OpenOptions}; use std::io::Write; use std::path::{Path, PathBuf}; @@ -36,6 +37,7 @@ use postgres_ffi::XLOG_BLCKSZ; use postgres_ffi::waldecoder::WalStreamDecoder; +use pq_proto::SystemId; use tokio::io::{AsyncReadExt, AsyncSeekExt}; pub trait Storage { @@ -478,6 +480,13 @@ pub struct WalReader { // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, + // We will respond with zero-ed bytes before this Lsn as long as + // pos is in the same segment as timeline_start_lsn. + timeline_start_lsn: Lsn, + // integer version number of PostgreSQL, e.g. 14; 15; 16 + pg_version: u32, + system_id: SystemId, + timeline_start_segment: Option, } impl WalReader { @@ -488,19 +497,27 @@ impl WalReader { start_pos: Lsn, enable_remote_read: bool, ) -> Result { - if start_pos < state.timeline_start_lsn { + if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { + bail!("state uninitialized, no data to read"); + } + + // TODO: Upgrade to bail!() once we know this couldn't possibly happen + if state.timeline_start_lsn == Lsn(0) { + warn!("timeline_start_lsn uninitialized before initializing wal reader"); + } + + if start_pos + < state + .timeline_start_lsn + .segment_lsn(state.server.wal_seg_size as usize) + { bail!( - "Requested streaming from {}, which is before the start of the timeline {}", + "Requested streaming from {}, which is before the start of the timeline {}, and also doesn't start at the first segment of that timeline", start_pos, state.timeline_start_lsn ); } - // TODO: add state.timeline_start_lsn == Lsn(0) check - if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { - bail!("state uninitialized, no data to read"); - } - Ok(Self { workdir, timeline_dir, @@ -509,10 +526,65 @@ impl WalReader { wal_segment: None, enable_remote_read, local_start_lsn: state.local_start_lsn, + timeline_start_lsn: state.timeline_start_lsn, + pg_version: state.server.pg_version / 10000, + system_id: state.server.system_id, + timeline_start_segment: None, }) } pub async fn read(&mut self, buf: &mut [u8]) -> Result { + // If this timeline is new, we may not have a full segment yet, so + // we pad the first bytes of the timeline's first WAL segment with 0s + if self.pos < self.timeline_start_lsn { + debug_assert_eq!( + self.pos.segment_number(self.wal_seg_size), + self.timeline_start_lsn.segment_number(self.wal_seg_size) + ); + + // All bytes after timeline_start_lsn are in WAL, but those before + // are not, so we manually construct an empty segment for the bytes + // not available in this timeline. + if self.timeline_start_segment.is_none() { + let it = postgres_ffi::generate_wal_segment( + self.timeline_start_lsn.segment_number(self.wal_seg_size), + self.system_id, + self.pg_version, + self.timeline_start_lsn, + )?; + self.timeline_start_segment = Some(it); + } + + assert!(self.timeline_start_segment.is_some()); + let segment = self.timeline_start_segment.take().unwrap(); + + let seg_bytes = &segment[..]; + + // How much of the current segment have we already consumed? + let pos_seg_offset = self.pos.segment_offset(self.wal_seg_size); + + // How many bytes may we consume in total? + let tl_start_seg_offset = self.timeline_start_lsn.segment_offset(self.wal_seg_size); + + debug_assert!(seg_bytes.len() > pos_seg_offset); + debug_assert!(seg_bytes.len() > tl_start_seg_offset); + + // Copy as many bytes as possible into the buffer + let len = (tl_start_seg_offset - pos_seg_offset).min(buf.len()); + buf[0..len].copy_from_slice(&seg_bytes[pos_seg_offset..pos_seg_offset + len]); + + self.pos += len as u64; + + // If we're done with the segment, we can release it's memory. + // However, if we're not yet done, store it so that we don't have to + // construct the segment the next time this function is called. + if self.pos < self.timeline_start_lsn { + self.timeline_start_segment = Some(segment); + } + + return Ok(len); + } + let mut wal_segment = match self.wal_segment.take() { Some(reader) => reader, None => self.open_segment().await?, diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 829cc814e8..262950b61d 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -42,12 +42,16 @@ def main(args: argparse.Namespace): res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] res = defaultdict(lambda: defaultdict(dict)) - logging.info("connecting to the database...") - with psycopg2.connect(connstr, connect_timeout=10) as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - logging.info("fetching flaky tests...") - cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) - rows = cur.fetchall() + try: + logging.info("connecting to the database...") + with psycopg2.connect(connstr, connect_timeout=30) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + logging.info("fetching flaky tests...") + cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) + rows = cur.fetchall() + except psycopg2.OperationalError as exc: + logging.error("cannot fetch flaky tests from the DB due to an error", exc) + rows = [] for row in rows: logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}") diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index de7b634ba0..597d9860d8 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -430,7 +430,10 @@ async fn main() -> Result<(), Box> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::init( + LogFormat::from_config(&args.log_format)?, + logging::TracingErrorLayerEnablement::Disabled, + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 2984f2c7d3..0e958ddd06 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -45,14 +45,18 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]], *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]], "pageserver_remote_physical_size", + "pageserver_remote_timeline_client_bytes_started_total", + "pageserver_remote_timeline_client_bytes_finished_total", ) PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_storage_operations_seconds_global_count", "pageserver_storage_operations_seconds_global_sum", "pageserver_storage_operations_seconds_global_bucket", + "pageserver_unexpected_ondemand_downloads_count_total", "libmetrics_launch_timestamp", "libmetrics_build_info", + "libmetrics_tracing_event_count_total", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c6610ba062..a46c19d7fd 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1451,6 +1451,7 @@ class NeonCli(AbstractNeonCli): branch_name: str, endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, + hot_standby: bool = False, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1470,6 +1471,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--port", str(port)]) if endpoint_id is not None: args.append(endpoint_id) + if hot_standby: + args.extend(["--hot-standby", "true"]) res = self.raw_cli(args) res.check_returncode() @@ -2206,6 +2209,7 @@ class Endpoint(PgProtocol): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env self.running = False + self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id @@ -2217,6 +2221,7 @@ class Endpoint(PgProtocol): self, branch_name: str, endpoint_id: Optional[str] = None, + hot_standby: bool = False, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Endpoint": @@ -2231,12 +2236,14 @@ class Endpoint(PgProtocol): if endpoint_id is None: endpoint_id = self.env.generate_endpoint_id() self.endpoint_id = endpoint_id + self.branch_name = branch_name self.env.neon_cli.endpoint_create( branch_name, endpoint_id=self.endpoint_id, tenant_id=self.tenant_id, lsn=lsn, + hot_standby=hot_standby, port=self.port, ) path = Path("endpoints") / self.endpoint_id / "pgdata" @@ -2361,6 +2368,7 @@ class Endpoint(PgProtocol): self, branch_name: str, endpoint_id: Optional[str] = None, + hot_standby: bool = False, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Endpoint": @@ -2375,6 +2383,7 @@ class Endpoint(PgProtocol): branch_name=branch_name, endpoint_id=endpoint_id, config_lines=config_lines, + hot_standby=hot_standby, lsn=lsn, ).start() @@ -2408,6 +2417,7 @@ class EndpointFactory: endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, + hot_standby: bool = False, config_lines: Optional[List[str]] = None, ) -> Endpoint: ep = Endpoint( @@ -2421,6 +2431,7 @@ class EndpointFactory: return ep.create_start( branch_name=branch_name, endpoint_id=endpoint_id, + hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, ) @@ -2431,6 +2442,7 @@ class EndpointFactory: endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, + hot_standby: bool = False, config_lines: Optional[List[str]] = None, ) -> Endpoint: ep = Endpoint( @@ -2449,6 +2461,7 @@ class EndpointFactory: branch_name=branch_name, endpoint_id=endpoint_id, lsn=lsn, + hot_standby=hot_standby, config_lines=config_lines, ) @@ -2458,6 +2471,36 @@ class EndpointFactory: return self + def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + branch_name = origin.branch_name + assert origin in self.endpoints + assert branch_name is not None + + return self.create( + branch_name=branch_name, + endpoint_id=endpoint_id, + tenant_id=origin.tenant_id, + lsn=None, + hot_standby=True, + config_lines=config_lines, + ) + + def new_replica_start( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): + branch_name = origin.branch_name + assert origin in self.endpoints + assert branch_name is not None + + return self.create_start( + branch_name=branch_name, + endpoint_id=endpoint_id, + tenant_id=origin.tenant_id, + lsn=None, + hot_standby=True, + config_lines=config_lines, + ) + @dataclass class SafekeeperPort: @@ -2928,32 +2971,18 @@ def fork_at_current_lsn( return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) -def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( - tenant_id: TenantId, - timeline_id: TimelineId, - safekeepers: List[Safekeeper], - pageserver: NeonPageserver, -): - sk_commit_lsns = [ - sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers - ] - lsn = max(sk_commit_lsns) - ps_http = pageserver.http_client() - wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn) - return lsn - - -def wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id: TenantId, - timeline_id: TimelineId, - safekeepers: List[Safekeeper], - pageserver: NeonPageserver, -): - lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( - tenant_id, timeline_id, safekeepers, pageserver - ) - ps_http = pageserver.http_client() +def last_flush_lsn_upload( + env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId +) -> Lsn: + """ + Wait for pageserver to catch to the latest flush LSN of given endpoint, + checkpoint pageserver, and wait for it to be uploaded (remote_consistent_lsn + reaching flush LSN). + """ + last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + ps_http = env.pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn) # force a checkpoint to trigger upload ps_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(ps_http, tenant_id, timeline_id, lsn) - return lsn + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + return last_flush_lsn diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index fe6da9f774..1cbd75ebaf 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -550,3 +550,13 @@ class PageserverHttpClient(requests.Session): def tenant_break(self, tenant_id: TenantId): res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") self.verbose_error(res) + + def post_tracing_event(self, level: str, message: str): + res = self.post( + f"http://localhost:{self.port}/v1/tracing/event", + json={ + "level": level, + "message": message, + }, + ) + self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c060fc8dea..7f8bb40bda 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -54,10 +54,9 @@ def wait_for_upload( if current_lsn >= lsn: log.info("wait finished") return + lr_lsn = last_record_lsn(pageserver_http, tenant, timeline) log.info( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 - ) + f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}" ) time.sleep(1) raise Exception( diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index aa99a01c83..d72ffe078d 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -59,11 +59,6 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): "value": "replica", "vartype": "enum" }, - { - "name": "hot_standby", - "value": "on", - "vartype": "bool" - }, { "name": "neon.safekeepers", "value": """ diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py new file mode 100644 index 0000000000..12e034cea2 --- /dev/null +++ b/test_runner/regress/test_hot_standby.py @@ -0,0 +1,79 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv + + +@pytest.mark.timeout(1800) +def test_hot_standby(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: + primary_lsn = None + cought_up = False + queries = [ + "SHOW neon.timeline_id", + "SHOW neon.tenant_id", + "SELECT relname FROM pg_class WHERE relnamespace = current_schema()::regnamespace::oid", + "SELECT COUNT(*), SUM(i) FROM test", + ] + responses = dict() + + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i") + + # Explicit commit to make sure other connections (and replicas) can + # see the changes of this commit. + p_con.commit() + + with p_con.cursor() as p_cur: + p_cur.execute("SELECT pg_current_wal_insert_lsn()::text") + res = p_cur.fetchone() + assert res is not None + (lsn,) = res + primary_lsn = lsn + + # Explicit commit to make sure other connections (and replicas) can + # see the changes of this commit. + # Note that this may generate more WAL if the transaction has changed + # things, but we don't care about that. + p_con.commit() + + for query in queries: + with p_con.cursor() as p_cur: + p_cur.execute(query) + res = p_cur.fetchone() + assert res is not None + response = res + responses[query] = response + + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + while not cought_up: + with s_con.cursor() as secondary_cursor: + secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()") + res = secondary_cursor.fetchone() + assert res is not None + (secondary_lsn,) = res + # There may be more changes on the primary after we got our LSN + # due to e.g. autovacuum, but that shouldn't impact the content + # of the tables, so we check whether we've replayed up to at + # least after the commit of the `test` table. + cought_up = secondary_lsn >= primary_lsn + + # Explicit commit to flush any transient transaction-level state. + s_con.commit() + + for query in queries: + with s_con.cursor() as secondary_cursor: + secondary_cursor.execute(query) + response = secondary_cursor.fetchone() + assert response is not None + assert response == responses[query] diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 1ae32fb398..a96532c0d8 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -6,7 +6,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, wait_for_last_flush_lsn, - wait_for_sk_commit_lsn_to_reach_remote_storage, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId @@ -199,7 +198,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): # with image_creation_threshold=1 which we will use on the last compaction cur.execute("vacuum") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + last_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) if i == 1 and j == 2 and k == 1: # last iteration; stop before checkpoint to avoid leaving an inmemory layer @@ -222,10 +221,8 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): tenant_update_config({"image_creation_threshold": "1"}) ps_http.timeline_compact(tenant_id, timeline_id) - # wait for all uploads to finish - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) + # wait for all uploads to finish (checkpoint has been done above) + wait_for_upload(ps_http, tenant_id, timeline_id, last_lsn) # shutdown safekeepers to avoid on-demand downloads from walreceiver for sk in env.safekeepers: diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py new file mode 100644 index 0000000000..d559be0a8f --- /dev/null +++ b/test_runner/regress/test_logging.py @@ -0,0 +1,49 @@ +import uuid + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import wait_until + + +@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) +def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): + # self-test: make sure the event is logged (i.e., our testing endpoint works) + log_expected = { + "trace": False, + "debug": False, + "info": True, + "warn": True, + "error": True, + }[level] + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + msg_id = uuid.uuid4().hex + + # NB: the _total suffix is added by our prometheus client + before = ps_http.get_metric_value("libmetrics_tracing_event_count_total", {"level": level}) + + # post the event + ps_http.post_tracing_event(level, msg_id) + if log_expected: + env.pageserver.allowed_errors.append(f".*{msg_id}.*") + + def assert_logged(): + if not log_expected: + return + assert env.pageserver.log_contains(f".*{msg_id}.*") + + wait_until(10, 0.5, assert_logged) + + # make sure it's counted + def assert_metric_value(): + if not log_expected: + return + # NB: the _total suffix is added by our prometheus client + val = ps_http.get_metric_value("libmetrics_tracing_event_count_total", {"level": level}) + val = val or 0.0 + log.info("libmetrics_tracing_event_count: %s", val) + assert val > (before or 0.0) + + wait_until(10, 1, assert_metric_value) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index cb08b014fd..5c02708457 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -12,8 +12,8 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, available_remote_storages, + last_flush_lsn_upload, wait_for_last_flush_lsn, - wait_for_sk_commit_lsn_to_reach_remote_storage, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( @@ -207,9 +207,7 @@ def test_ondemand_download_timetravel( env.endpoints.stop_all() # wait until pageserver has successfully uploaded all the data to remote storage - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) + wait_for_upload(client, tenant_id, timeline_id, current_lsn) def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) @@ -347,12 +345,9 @@ def test_download_remote_layers_api( """ ) + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) env.endpoints.stop_all() - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) - def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) return d["current_physical_size"] diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 1ed86d19a2..8677a554f7 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,3 +1,4 @@ +import json from contextlib import closing import psycopg2.extras @@ -22,9 +23,12 @@ wait_lsn_timeout='111 s'; checkpoint_distance = 10000 compaction_target_size = 1048576 evictions_low_residence_duration_metric_threshold = "2 days" +eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" } """ env = neon_env_builder.init_start() + # we configure eviction but no remote storage, there might be error lines + env.pageserver.allowed_errors.append(".* no remote storage configured, cannot evict layers .*") http_client = env.pageserver.http_client() # Check that we raise on misspelled configs @@ -44,6 +48,7 @@ evictions_low_residence_duration_metric_threshold = "2 days" "checkpoint_distance": "20000", "gc_period": "30sec", "evictions_low_residence_duration_metric_threshold": "42s", + "eviction_policy": json.dumps({"kind": "NoEviction"}), } tenant, _ = env.neon_cli.create_tenant(conf=new_conf) @@ -84,6 +89,11 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert effective_config["image_creation_threshold"] == 3 assert effective_config["pitr_interval"] == "7days" assert effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" + assert effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23h", + } # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: @@ -121,6 +131,9 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert ( new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s" ), "Should override default value" + assert new_effective_config["eviction_policy"] == { + "kind": "NoEviction" + }, "Specific 'eviction_policy' config should override the default value" assert new_effective_config["compaction_target_size"] == 1048576 assert new_effective_config["compaction_period"] == "20s" assert new_effective_config["compaction_threshold"] == 10 @@ -135,6 +148,9 @@ evictions_low_residence_duration_metric_threshold = "2 days" "compaction_period": "80sec", "image_creation_threshold": "2", "evictions_low_residence_duration_metric_threshold": "23h", + "eviction_policy": json.dumps( + {"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"} + ), } env.neon_cli.config_tenant( tenant_id=tenant, @@ -180,6 +196,11 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert ( updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h" ), "Should override default value" + assert updated_effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "period": "1m 20s", + "threshold": "1day 18h", + }, "Specific 'eviction_policy' config should override the default value" assert updated_effective_config["compaction_target_size"] == 1048576 assert updated_effective_config["compaction_threshold"] == 10 assert updated_effective_config["gc_horizon"] == 67108864 @@ -239,6 +260,11 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert final_effective_config["gc_period"] == "1h" assert final_effective_config["image_creation_threshold"] == 3 assert final_effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" + assert final_effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23h", + } # restart the pageserver and ensure that the config is still correct env.pageserver.stop() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index d7c0814570..dca2cd3d28 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -21,7 +21,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, available_remote_storages, - wait_for_sk_commit_lsn_to_reach_remote_storage, + last_flush_lsn_upload, ) from fixtures.pageserver.utils import ( assert_tenant_state, @@ -174,12 +174,9 @@ def test_tenants_attached_after_download( ) ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) env.endpoints.stop_all() - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) - env.pageserver.stop() timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)